NVIDIA Cuda GPU
Cuda is a C (and partial C++) SIMT programming model for numerical computations on Nvidia GeForce graphics processing units (GPU). It mixes a SIMD style (warps) with a multi-threaded style (blocks), with access to device memory and to fast on-chip shared memory that is shared by all threads of a block. Data transfer to and from the device is initiated by the host.
A simplified translation of the following example parallel-for loop is given below.
// Source program in a parallel-array DSL: a 1-D grid of n+1 points,
// an iterator over the interior points [1, n), and two distributed
// arrays allocated on that grid.
Grid1 *g = new Grid1(0, n+1);
Grid1IteratorSub it(1, n, g);
DistArray x(g), y(g);
...
float e = 0;
// Parallel-for over i: update x from the y-neighbours and accumulate
// the sum of squares of y into e (a reduction).
ForEach(int i, it,
x(i) += ( y(i+1) + y(i-1) )*.5;
e += sqr( y(i) ); )
...
|
thread code:
#include <cuda.h>
// Kernel for one 512-element tile per block.
// Expected launch: dimGrid = ((n+1)/512, 1, 1), dimBlock = (512, 1, 1).
// fx, fy: device arrays of n+1 floats; fe: one partial sum per block.
// Static shared memory: 3 tiles of ~512 floats (no dynamic smem).
__global__ void sub1(float* fx, float* fy, float* fe) {
int t = threadIdx.x; // builtin: thread index within the block, 0..511
int b = blockIdx.x; // builtin: tile (block) index
float e;
__shared__ float se[512];   // per-block reduction scratch
__shared__ float sx[512];   // tile of x
__shared__ float sy[512+2]; // tile of y plus a 2-element halo
// stage the tile from device (global) memory into shared memory
sx[t] = fx[512*b+t];
sy[t] = fy[512*b+t];
if (t<2)
sy[t+512] = fy[512*b+t+512];
__syncthreads(); // all loads visible before any thread reads sy/sx
// do computation: x(i) += ( y(i+1) + y(i-1) )*.5
// sy[t] / sy[t+2] are the left / right y-neighbours of this
// thread's point; .5f keeps the arithmetic in single precision
sx[t] += ( sy[t+2] + sy[t] )*.5f;
// e accumulates sqr(y(i)); sy[t+1] is y(i), the point between the
// two neighbours above.  (The original squared sx twice by passing
// an already-squared value to sqr.)
e = sy[t+1] * sy[t+1];
// copy result back to device memory
fx[512*b+t] = sx[t];
// block-wide tree reduction of e into se[0].
// __syncthreads() must be reached by EVERY thread of the block, so
// the barrier stays OUTSIDE the divergent "if (t < s)" — placing it
// inside the branch (as the original did) is undefined behaviour.
// Running the loop down to s == 1 also avoids relying on implicit
// warp synchrony, which no longer holds on Volta and later.
se[t] = e;
__syncthreads();
for (int s = 256; s > 0; s >>= 1) {
if (t < s)
se[t] += se[t+s];
__syncthreads();
}
// thread 0 publishes this block's partial sum
if (t==0)
fe[b] = se[0];
}
|
main code:
float *x = new float[n+1];
float *y = new float[n+1];
...
float e = 0;
// allocate GPU (device) memory; x/y stay on the host
float *fx, *fy, *fe;
cudaMalloc((void**)&fx, (n+1) * sizeof(float));
cudaMalloc((void**)&fy, (n+1) * sizeof(float));
cudaMalloc((void**)&fe, (n+1)/512 * sizeof(float)); // one partial sum per block
float *de = new float[(n+1)/512];
// copy to GPU memory — cudaMemcpy(dst, src, bytes, direction)
cudaMemcpy(fx+1, &x[1], (n-1) * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(fy, &y[0], (n+1) * sizeof(float),
cudaMemcpyHostToDevice);
dim3 dimBlock(512, 1, 1);      // 512 threads per block
dim3 dimGrid((n+1)/512, 1, 1); // one block per 512-element tile
// call GPU (launch is asynchronous; the blocking cudaMemcpy below
// waits for the kernel to finish)
sub1<<<dimGrid, dimBlock>>>(fx, fy, fe);
// copy to host memory — the HOST pointer is the destination and must
// come first (the original had dst/src swapped, which would clobber
// the device results instead of retrieving them)
cudaMemcpy(&x[1], fx+1, (n-1) * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(&de[0], fe, (n+1)/512 * sizeof(float),
cudaMemcpyDeviceToHost);
// release GPU memory
cudaFree(fe);
cudaFree(fy);
cudaFree(fx);
// final reduction of the per-block partial sums on the host
for (int i=0; i<(n+1)/512; ++i)
e += de[i];
delete[] de;
...
// note: "delete[] x, y;" is a comma expression that deletes only x
// and leaks y — each array needs its own delete[]
delete[] x;
delete[] y;
|
Array sizes that are not multiples of 512, and arrays larger than the device memory, require some code modifications.