9.6 多线程多GPU方案

CUDA自始至终支持多个GPU，但在CUDA 4.0之前，每个GPU都必须由单独的CPU线程控制。对于需要大量CPU计算能力的工作负载，这一要求并不过分，因为只有通过多线程，现代多核处理器的全部潜力才可以释放。

N-体问题的多线程多GPU实现为每个GPU创建一个CPU线程，并把给定的一遍（pass）N-体计算中工作任务的分派和同步委托给各个线程。主线程在GPU之间均匀分割工作任务，通过触发一个事件（在Linux等POSIX平台上则是一个信号量）把任务委托给每个工作线程，然后等待所有工作线程都发出完成信号后才继续执行。随着GPU数量的增长，同步开销会逐渐抵消并行化带来的收益。

这里的N-体实现，采用了与14.9节中“多线程实现”相同的多线程库。附录A中所描述的WorkerThread类，使应用程序线程把工作任务委托给CPU线程，然后在工作线程完成时对委托任务进行同步。

代码清单9-5给出了创建和初始化CPU线程的主机代码。有两个全局变量g_numGPUs和g_GPUThreadPool，分别保存GPU的数量和每一个GPU的工作线程。每个CPU线程被创建后，它以同步的方式调用initializeGPU()函数进行初始化，它将为CPU线程分配一个GPU，并且这一分配在应用程序的整个执行过程中都不会改变。

代码清单9-5 多线程多GPU方案的初始化代码

// Worker-thread pool and core count for CPU-side work.
// NOTE(review): neither is referenced elsewhere in this listing; presumably
// used by the CPU-only code paths -- confirm against the full source.
workerThread *g_CPUThreadPool;
int g_numCPUcores;

// One worker thread per GPU and the number of GPUs in use. The pool is
// allocated in main() as an array of g_numGPUs workerThread objects.
workerThread *g_GPUThreadPool;
int g_numGPUs;
// Parameter/result record handed to initializeGPU() through its void* argument.
struct gpuInit_struct
{
    int iGPU;            // in:  index of the GPU this worker thread should use
    cudaError_t status;  // out: result code written by initializeGPU()
};
void initializeGPU(void \*p) { cudaError_t status; gpuInit_struct \*p = (gpuInit_struct \*) p; CUDA_CHECK(udaSetDevice(p->iGPU)); CUDA_CHECK(udaSetDeviceFlags(cudaDeviceMapHost)); CUDA_CHECK(udaFree(o)); Error: p->status  $=$  status;   
}   
//...below is from main() if(g_numGPUs){ chCommandLineGet(&g_numGPUs,"numgpus",argc,argv); g_GPUThreadPool  $=$  new workerThread[g_numGPUs]; for(size_t i=0;i<g_numGPUs;i++) { if(!g_GPUThreadPool[i].initialize()){ fprintf(stderr, "Error initializing thread pool\n"); return 1; } for(int i=0;i<g_numGPUs;i++) { gpuInit_struct initGPU  $=$  {i}; g_GPUThreadPool[i].delegateSynchronous( initializeGPU, &initGPU); if(cudaSuccess != initGPU.status){ fprintf(stderr, "Initializing GPU%d failed " with %d (%s)\n", i, initGPU.status, CUDAGetErrorString(initGPU.status)); return 1; }

一旦工作线程初始化完成，它们将挂起并等待一个线程同步原语，直到应用程序线程给它们分派工作。代码清单9-6显示了分派工作给GPU的主机代码：gpuDelegation结构封装了一个给定GPU必须做的工作；对于每个由代码清单9-5中代码创建的工作线程均启动gpuWorkerThread函数。代码清单9-7中显示的应用程序线程代码，为每个工作线程创建一个gpuDelegation结构并调用delegateAsynchronous()方法以启动代码清单9-6中代码。waitAll()方法将等待所有工作线程的完成。关于基于单线程和多线程的多GPU版本的N-体解决方案的性能和可扩展性，总结在14.7节。

代码清单9-6 主机代码（工作线程）

structgpuDelegation{ size_t i; //base offset for this thread to process size_t n; //size of this thread's problem size_t N; //total number of bodies float \*hostPosMass; float \*hostForce; float softeningSquared; JudaError_t status; }；   
void
gpuWorkerThread( void *_p )
{
    cudaError_t status;
    gpuDelegation *p = (gpuDelegation *) _p;
    float *dptrPosMass = 0;
    float *dptrForce = 0;

    //
    // Each GPU has its own device pointer to the host pointer.
    //
    CUDART_CHECK( cudaMalloc( &dptrPosMass, 4*p->N*sizeof(float) ) );
    CUDART_CHECK( cudaMalloc( &dptrForce, 3*p->n*sizeof(float) ) );
    CUDART_CHECK( cudaMemcpyAsync(
        dptrPosMass,
        p->hostPosMass,
        4*p->N*sizeof(float),
        cudaMemcpyHostToDevice ) );
    ComputeNBodyGravitation_multiGPU<<<300,256,256*sizeof(float4)>>>(
        dptrForce,

代码清单9-7 主机代码（应用线程）

        dptrPosMass,
        p->softeningSquared,
        p->i,
        p->n,
        p->N );
    // NOTE: synchronous memcpy, so no need for further
    // synchronization with device
    CUDART_CHECK( cudaMemcpy(
        p->hostForce+3*p->i,
        dptrForce,
        3*p->n*sizeof(float),
        cudaMemcpyDeviceToHost ) );
Error:
    cudaFree( dptrPosMass );
    cudaFree( dptrForce );
    p->status = status;
}

float   
ComputeGravitation-multiGPU_threaded( float \*force, float \*posMass, float softeningSquared, size_t N   
} chTimerTimestamp start, end; chTimerGetTime( &start ); { gpuDelegation \*pgpu  $=$  new gpuDelegation[g_numGPUs]; size_t bodiesPerGPU  $= \mathrm{N}$  /g_numGPUs; if(N&g_numGPUs){ return 0.of; } size_t i; for(i = 0;i < g_numGPUs;i++) { pgpu[i].hostPosMass  $=$  g_hostAOS_PosMass; pgpu[i].hostForce  $=$  g_hostAOS_Force; pgpu[i].softeningSquared  $=$  softeningSquared; pgpu[i].i  $=$  bodiesPerGPU\*i; pgpu[i].n  $=$  bodiesPerGPU; pgpu[i].N  $=$  N; g_GPUThreadPool[i].delegateAsynchronous(gpuWorkerThread, &pgpu[i]); } workerThread::waitAll(g_GPUThreadPool,g_numGPUs); delete[] pgpu;   
} chTimerGetTime(&end); return chTimerElapsedTime(&start,&end）\*1000.of;

9.6_多线程多GPU方案

9.6 多线程多GPU方案