Parallel For

SGI shmem one-sided communication

Target platform: virtual shared memory on distributed-memory systems, e.g. large shared-memory computers and distributed-memory computers with low-latency communication. Typical machines are SGI and Cray computers, e.g. the SGI Altix.

A simplified translation of the following example parallel-for loop is given below.


...

// 1-D grid constructed from the bounds (0, n+1).
// NOTE(review): presumably index 0 and the upper end are boundary points — confirm the bounds convention of Grid1.
Grid1 *g = new Grid1(0, n+1);
 
// Iterator over the sub-range (1, n) of g — presumably the interior points; TODO confirm whether n is inclusive.
Grid1IteratorSub it(1, n, g);
 
// Two arrays defined on the grid g.
DistArray x(g), y(g);
 
// Accumulator for the sum of squares gathered in the loop below.
float e = 0;
 
...

// Parallel-for over the indices delivered by it. The body
//   - updates x(i) with the average of the two neighbours of y(i), and
//   - accumulates the square of y(i) into e (a reduction across iterations).
ForEach(int i, it,
 
  x(i) += ( y(i+1) + y(i-1) )*.5;
 
  e    += sqr( y(i) ); )
 
...

main code:


#include <mpp/shmem.h>

...

// Initialise the SHMEM runtime; this has to happen before any other shmem call below.
start_pes(0);

...

// Block decomposition: every PE owns nn consecutive indices
// [n_local0, n_local1). The integer division truncates, so indices
// beyond num_pes*nn are not assigned to any PE unless (n-1) is a
// multiple of the PE count.
const int pe_rank  = _my_pe();
const int pe_count = _num_pes();

int nn       = (n-1) / pe_count;
int n_local0 = 1 + pe_rank * nn;
int n_local1 = n_local0 + nn;

// Allocate only the local part plus one ghost cell on each side
// (nn+2 floats) from the symmetric heap. The pointers are then shifted
// by (n_local0-1) so that x and y are addressed with the same indices
// n_local0-1 .. n_local1 that the loop below uses.
float *x_base = (float*)shmalloc((nn+2)*sizeof(float));
float *y_base = (float*)shmalloc((nn+2)*sizeof(float));

float *x = x_base - (n_local0-1);
float *y = y_base - (n_local0-1);

...

// fill ghost zone
//
// Each PE fetches one value of y from each existing neighbour with a
// one-sided get. The source argument of shmem_float_get is written as a
// local address and names the same slot of the symmetric buffer on the
// remote PE. Because y is shifted by (n_local0-1), the slots are:
//   &y[n_local0-1] -> slot 0      (left ghost cell)
//   &y[n_local0]   -> slot 1      (first owned cell)
//   &y[n_local1-1] -> slot nn     (last owned cell)
//   &y[n_local1]   -> slot nn+1   (right ghost cell)

// All PEs synchronise here before reading remote data
// (presumably so that the neighbours' y values are complete — the code
// that fills y is not shown).
shmem_barrier_all();

// Left neighbour exists: copy its last owned cell into our left ghost cell.
if (_my_pe() > 0)

    shmem_float_get(&y[n_local0-1], &y[n_local1-1], 1, _my_pe()-1);

// Right neighbour exists: copy its first owned cell into our right ghost cell.
if (_my_pe() < _num_pes()-1)

    shmem_float_get(&y[n_local1], &y[n_local0], 1, _my_pe()+1);

// do computation on the locally owned index range [n_local0, n_local1):
// update x with the average of the two y neighbours and collect the
// local part of the sum of squares of y
float e_local = 0;

for (int i = n_local0; i < n_local1; ++i) {
    const float right  = y[i+1];
    const float left   = y[i-1];
    const float centre = y[i];

    x[i]    += ( right + left )*.5;
    e_local += centre * centre;
}

static float work[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];

static long sync[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];

static float el, e;

el = e_local;

shmem_float_sum_to_all(&e, &el, 1,

   0, 0, _num_pes(), work, sync);

...

shfree(x + (n_local0-1));

shfree(y + (n_local0-1));