#include <mpp/shmem.h>
#include <CL/cl.h>
#include <stdlib.h>
 
 
 // kernel
 #define BLOCK (512)
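// sub1: each work-item adds the average of its two y-neighbours to one interior
// point of x; each work-group reduces the squares of its y values in local
// memory and writes one partial sum per block to fe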
 const char *source =
 "__kernel void sub1(__global float* fx,\
 __global const float* fy,\
 __local float* se, __global float* fe) {\
const unsigned int t = get_local_id(0);\
 const unsigned int b = get_group_id(0);\
 const unsigned block = 512;\
 const unsigned int i = block*b+t;\
 float e;\
 /* do computation */\
fx[i] += ( fy[i+2] + fy[i] )*.5f;\
e = fy[i+1] * fy[i+1];\
 /* reduction */\
 se[t] = e;\
 barrier(CLK_LOCAL_MEM_FENCE);\
if (t<256)\
se[t] += se[t+256];\
barrier(CLK_LOCAL_MEM_FENCE);\
if (t<128)\
se[t] += se[t+128];\
barrier(CLK_LOCAL_MEM_FENCE);\
if (t<64)\
se[t] += se[t+64];\
barrier(CLK_LOCAL_MEM_FENCE);\
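/* final steps without barriers: relies on at least 32 work-items running in lock-step */\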
 if (t<32) {\
 se[t] += se[t+32];\
 se[t] += se[t+16];\
 se[t] += se[t+8];\
 se[t] += se[t+4];\
 se[t] += se[t+2];\
 se[t] += se[t+1];\
 }\
 if (t==0)\
 fe[b] = se[0];\
 }";
 
 int main(int argc, char *argv[]) {
 int n = ...;
 start_pes(0);
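// block decomposition: PE p owns the nn grid points n_local0 .. n_local1-1
// plus one ghost cell on each side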
 int nn = (n-1) / _num_pes();
 int n_local0 = 1 + _my_pe() * nn;
 int n_local1 = 1 + (_my_pe()+1) * nn;
 // allocate only local part + ghost zone of the arrays x,y
 float *x, *y;
 x = (float*)shmalloc((n_local1 - n_local0 + 2)*sizeof(float));
 y = (float*)shmalloc((n_local1 - n_local0 + 2)*sizeof(float));
 x -= (n_local0 - 1);
 y -= (n_local0 - 1);
 shmem_barrier_all();
 
 ... // fill x, y
 
 // fill ghost zone
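// each PE sends its first interior value to the left neighbour's right ghost
// cell and its last interior value to the right neighbour's left ghost cell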
 if (_my_pe() > 0)
 shmem_float_put(&y[n_local1], &y[n_local0], 1, _my_pe()-1);
 if (_my_pe() < _num_pes()-1)
 shmem_float_put(&y[n_local0-1], &y[n_local1-1], 1, _my_pe()+1);
 shmem_barrier_all();
 
// create an OpenCL context for a GPU device and query the device list
 cl_context ct = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);
 size_t ctsize;
 clGetContextInfo(ct, CL_CONTEXT_DEVICES, 0, 0, &ctsize);
 cl_device_id *aDevices = (cl_device_id*)malloc(ctsize);
 clGetContextInfo(ct, CL_CONTEXT_DEVICES, ctsize, aDevices, 0);
 // compile kernel
 cl_program prog = clCreateProgramWithSource(ct, 1, &source, 0, 0);
 clBuildProgram(prog, 0, 0, 0, 0, 0);
 cl_kernel kern = clCreateKernel(prog, "sub1", 0);
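// e accumulates this PE's local contribution to the global sum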
 float e = 0;
 cl_command_queue queue = clCreateCommandQueue(ct, aDevices[0], 0, 0);
 // allocate GPU memory
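// fx holds only the interior points, fy additionally holds the two ghost cells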
 cl_mem fx = clCreateBuffer(ct, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 (n_local1-n_local0)*sizeof(cl_float), &x[n_local0], 0);
 cl_mem fy = clCreateBuffer(ct, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 (n_local1-n_local0+2)*sizeof(cl_float), &y[n_local0-1], 0);
// the __local reduction scratch se is not a buffer object; its size is
// passed via clSetKernelArg below
cl_mem fe = clCreateBuffer(ct, CL_MEM_WRITE_ONLY,
(n_local1-n_local0)/BLOCK*sizeof(cl_float), 0, 0);
clSetKernelArg(kern, 0, sizeof(cl_mem), (void *)&fx);
clSetKernelArg(kern, 1, sizeof(cl_mem), (void *)&fy);
clSetKernelArg(kern, 2, BLOCK*sizeof(cl_float), 0);  // __local scratch se
clSetKernelArg(kern, 3, sizeof(cl_mem), (void *)&fe);
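// host buffer for the per-block partial sums produced by the kernel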
 float *d = new float[(n_local1-n_local0)/BLOCK];
 // call GPU
const size_t size = BLOCK;
const size_t dim = n_local1-n_local0;  // assumes nn is a multiple of BLOCK
clEnqueueNDRangeKernel(queue, kern, 1, 0, &dim, &size, 0, 0, 0);
 // copy to host memory
 clEnqueueReadBuffer(queue, fx, CL_TRUE, 0,
 (n_local1-n_local0) * sizeof(cl_float), &x[n_local0], 0, 0, 0);
clEnqueueReadBuffer(queue, fe, CL_TRUE, 0,
(n_local1-n_local0)/BLOCK * sizeof(cl_float), d, 0, 0, 0);
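// finish the reduction on the host: add up the per-block partial sums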
 float ee = 0;
 for (int i=0; i<(n_local1-n_local0)/BLOCK; ++i)
 ee += d[i];
 e += ee;
 delete[] d;
// release GPU resources
clReleaseMemObject(fx);
clReleaseMemObject(fy);
clReleaseMemObject(fe);
clReleaseKernel(kern);
clReleaseProgram(prog);
clReleaseCommandQueue(queue);
clReleaseContext(ct);
free(aDevices);
 
// combine the partial results of all PEs with a SHMEM sum reduction
static float work[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long sync[_SHMEM_REDUCE_SYNC_SIZE];
for (int i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; ++i) sync[i] = _SHMEM_SYNC_VALUE;
shmem_barrier_all();
static float el, es;
 el = e;
 shmem_float_sum_to_all(&es, &el, 1,
 0, 0, _num_pes(), work, sync);
 e = es;
 
 ... // output x, e
 
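// undo the index shift before returning the symmetric memory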
 x += (n_local0 - 1);
 y += (n_local0 - 1);
 shfree(x);
 shfree(y);
 return 0;
 }