分别用 mpi 和 cuda 实现圆周率 pi 的 Lebniz级数计算

突然发现今天是3月14日，3.14，圆周率日，所以准备搞搞新花样，用并行的方式计算一串长长的级数求和。时间所限，所以暂时先搞一个粗糙的版本。这里分别尝试用 mpi 和 cuda 来计算 pi 的级数求和公式，求和项数越多，结果越精确。因为求和的项与项之间没有前后依赖，所以可以并行实现，每个核承担一部分的求和任务。
最简单的方式是用莱布尼兹求和公式 pi / 4 = 1 - 1/3 + 1/5 - 1/7 + 1/9 - 1/11 + …, 这是计算 pi 的最简单的公式（arctan(1.) 的泰勒展开式），但也是收敛速度最慢的，时间所限先将就用着，后面有时间再去用更高级的公式以及用数组实现任意长整数来计算pi的更多位数。
莱布尼兹求和公式每计算 n 项，与精确值之间的误差可以缩小到大概 1/ 2n, 比如如果要计算 128 G （≈ 0.1T ≈ 10^11）项之和，则大概可以使结果精确到第小数点后第11位，我们试试计算 128 G 的项数，看看需要多长时间。

首先是 mpi, 用多个 cpu 并行计算，代码如下：

#include "mpi.h"
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#define ll long long
using namespace std;


int main(int argc, char** argv) {
    
    ll N = 128 * (ll)1 << 30;  // items for summation
    int proc_id, proc_num;
    double local = 0, total = 0, recv;
    MPI_Status status;
    clock_t begT, endT;
    
    begT = clock();
    MPI_Init(&argc, &argv);  // initialize of MPI
    MPI_Comm_size(MPI_COMM_WORLD, &proc_num);  // number of processors
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_id);  // id of this processor

	local = 0;
    /// compute PI by summation of array
    ll batch_size = N / proc_num;
    ll base = proc_id * batch_size;
    for (ll id = batch_size - 1; id > -1; --id) {
    // for (ll i = proc_id; i < N; i += proc_num) {
        ll i = base + id;
        local += ((i & (ll)1)? -1: 1)
            * (double)1. / ((double)2. * i + 1.);
    }
	if (proc_id != 0) {  // send other processors' results to processor id: 0
		printf("\033[35;1m Sending result \033[40;33;1m %15.14f \033[35;1m from \033[32;1m %d \033[35;1m to \033[32;1m 0 \033[0m\n", local, proc_id);
		MPI_Send(&local, 1, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);
	} else { // processor id0 receives others' results
		for (int i = 1; i < proc_num; ++i) {
			MPI_Recv(&recv, 1, MPI_DOUBLE, i, 1, MPI_COMM_WORLD, &status);
			total += recv;
		}
		total += local;
        endT = clock();
		printf("\033[35;1m >>> parallel, Result: \033[31;1m %15.14f \033[0m\n", total * 4.); 
	}
    MPI_Finalize();  // end of MPI
    
    if(proc_id == 0) printf("\033[32;1m %i cpus, consuming time is \033[40;33;1m%lf\033[0m s\n", 
                            proc_num, (double)(endT - begT) / CLOCKS_PER_SEC);

    /// ============================================================================================ cpu serial version
    if(proc_id == 0) {
        begT = clock();
        total = 0.;
        for (ll i = N - 1; i != -1; --i) {
            total += ((i & (ll)1)? -1: 1)
                * (double)1. / ((double)2. * i + 1.);
        }
        endT = clock();
        printf("\033[35;1m >>> single cpu, Result: \033[31;1m %15.14f \033[0m\n", total * 4.); 
        printf("\033[32;1m 1 cpus, consuming time is \033[40;33;1m%lf\033[0m s\n", (double)(endT - begT) / CLOCKS_PER_SEC);
    }

    return 0;
}

结果如下，基本符合线性的加速比。这里用的服务器有 96 个cpu （Intel? Xeon? Platinum）, 我最多用 95 个核并行，最快的时间是 4.23 秒完成 128G 项，其中每求和一项需要一次乘法一次除法以及两次加法，即4.23秒之内总共完成了4 x 128G 次浮点数运算，说明整个服务器的算力大概是 512 Gflops

cpu个数	1	20	40	60	80	95
计算时间	320 s	17.32 s	8.79 s	6.01 s	5.02 s	4.23 s

然后轮到 cuda,

cuda 调用 gpu 并行计算，核数更加多。
这里的并行策略是，把n项求和分成 a * b 项来求和，比如：

第一步：申请一个长度为 a 的数组，其中每个元素记录 b 个数的求和结果；
第二步：把数组的 a 个元素求和得到最后结果。

其中第二步对数组的a个数求和可以用并行的 reduce sum 规约求和来加速。

实际上由于这里的求和项数 n 超级大，远远大于数组的可用长度a，因此 b >> a, 即上面的第一步占了计算时间的大头，而第二步占用的时间反而比较小。
比如如果要对 128G 项求和，每项用双精度浮点数(8 Byte), 那么总共需要128 * 8GB = 1 TB 的内存，远大于gpu内存（比如Tesla V100 的30G内存），所以不可能直接用数组从头到尾规约求和的方式得到最终结果，所以我这里用上面提到的两步法，首先数组中每个元素对多项进行求和，然后才是数组中的元素求和。
代码如下：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <vector>
#include <iostream>
#include <algorithm>
#include <math.h>
#include <memory>
#include <numeric>
#include <time.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/copy.h>
#define ll long long
using namespace std;


template<typename T> __global__ void 
lebnizPi_kernel(T *vec, ll vecSize, ll numsPerThread) {
    ll tid = blockIdx.x * blockDim.x + threadIdx.x;
    double tmp(0.);
    if (tid < vecSize) {
        ll base(tid * numsPerThread);
        for (ll i = numsPerThread - 1; i > -1; --i) {
            ll idx = base + i;
            tmp += ((idx & (ll)1)? -1: 1) 
                * (double)1. / ((double)2. * idx + 1);
        }
        vec[tid] = tmp;
    }
}


template<typename T> inline T
ceil(T numerator, T denominator) {
    return (numerator + denominator - 1) / denominator;
}


double lebnizPi_cu(ll big) {   
	/*
	 * input, big: the items number for summation
	 * output, pi: computed from Lebniz-sum
	 */
    ll vecSize = min((ll)1 << 16, big);  
    ll numsPerThread = ceil(big, vecSize);
    cout << "\033[32;1m numsPerThread = " << numsPerThread << "\033[0m" << endl;
    cout << "\033[32;1m vecSize = " << vecSize << "\033[0m" << endl;

    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, 0);
    ll maxBlockSize = devProp.maxThreadsPerBlock;
    ll blockSize = min(maxBlockSize >> 1, vecSize);
    ll gridSize = ceil(vecSize, blockSize);
    cout << "\033[35;1m gridSize = " << gridSize << "\033[0m" << endl;
    cout << "\033[35;1m blockSize = " << blockSize << "\033[0m" << endl;

    double *vec;
    cudaMalloc((void**)&vec, vecSize * sizeof(*vec));
    lebnizPi_kernel <<<gridSize, blockSize>>> (vec, vecSize, numsPerThread);

    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        cerr << "lebnizPi_kernel, CUDA Error: " << cudaGetErrorString(err) << endl;
    } 
    cout << "\033[32;1m now we are after kernel. \033[0m" << endl;

    thrust::device_vector<double> d_vec(vec, vec + vecSize);
    double res = thrust::reduce(d_vec.begin(), d_vec.end());

    cudaFree(vec);
    return res * 4.;
}


double lebnizPi_direct(ll big) {
	/* serially compute pi by cpu */
    double res = 0.;
    for (ll i = big - 1; i > -1; --i) {
        res += ((i & ll(1))? -1: 1) 
            * (double)1. / (double(2.) * i + 1);
    }
    return res * 4.;
}


int main() {
    clock_t begT, endT;

    ll big = 128 * (ll)1 << 30;  // 128 Gi
    double res;

    begT = clock();
    res = lebnizPi_cu(big);
    endT = clock();
    cout << "\033[35;1m    using cuda, pi = "; printf("%15.14f", res);
    cout << ", time = " << (double)(endT - begT) / CLOCKS_PER_SEC << " s \033[0m" << endl;
        
    begT = clock();
    res = lebnizPi_direct(big);
    endT = clock();
    cout << "\033[35;1m serial compute, pi = "; printf("%15.14f", res);
    cout << ", time = " << (double)(endT - begT) / CLOCKS_PER_SEC << " s \033[0m" << endl;
}

其中，

求和的时候，先把比较小的数累加起来然后再加比较大的数，可以降低浮点数的大数吃小数造成的误差。反之如果先加大数，前面一个很大的数加上后面一个很小的数容易导致小的数被吃掉，降低结果精度，所以这里的计算都是先加比较小的数，后加比较大的数。
所以我们这里的串行版本的求和是这样写的：

    for (ll i = big - 1; i > -1; --i) {
        res += ((i & ll(1))? -1: 1) 
            * (double)1. / (double(2.) * i + 1);
    }

另外，求和的时候，每一项都是一正一负相间分布的，导致循环中需要用 if-else 语句判断正负号，但是如果我们隔着两项求和（相当于先累加正数项，再累加负数项），就可以使得循环中每一次的符号都一致，于是避免了n次判断语句的执行，可以得到一定程度的加速。比如 kernel 函数可以这样写：

template<typename T> __global__ void 
lebnizPi_kernel(T *vec, ll vecSize, ll numsPerThread) {
    ll tid = blockIdx.x * blockDim.x + threadIdx.x;
    double tmp(0.);
    if (tid < vecSize) {
        ll base = tid * numsPerThread;
        ll right = base + numsPerThread - 1;
        int sign = (right & ll(1))? -1 : 1;
        
        tmp = 0.;
        for (ll id = right; id >= base; id -= 2) {
            tmp += sign * double(1.) / (double(2.) * id + 1);
        }
        vec[tid] += tmp;

        tmp = 0.; sign *= -1;
        for (ll id = right - 1; id >= base; id -=2) {
            tmp += sign * double(1.) / (double(2.) * id + 1);
        }
        vec[tid] += tmp;
    }
}

cuda 的实验结果与mpi对比如下：

	硬件	计算结果	计算时间	加速比
串行	Intel Plantinum 1 张	3.14159265358 252	320 s	1
mpi	Intel Plantinum 95张	3.14159265358 252	4.43 s	72
cuda	Tesla V100 (流处理器 5120个)	3.14159265358 237	0.71 s	450

可见由于gpu核数多，对于这种求和这种并行度比较高的计算，用cuda可以得到很大的加速比，速度比96核的服务器还要快。（另外，5120个流处理器只达到了750倍的加速比, 可能一方面是因为gpu的主频（1.5GHz）没有cpu高（2.3GHz），另一方面是gpu中的大数组读写造成额外的时间开销）。其实，肯定还有优化的方法可以进一步榨干机器的性能，等以后有空再试试吧，今天暂时先到这里了