SGI shmem one-sided communication
Targets virtual shared memory on distributed memory systems, e.g. large shared memory computers and distributed memory computers with low-latency communication. Typical platforms are SGI and Cray computers, e.g. SGI Altix.
A simplified translation of the following example parallel-for loop is given below.
...
// Input example for the translator: a parallel-for over a 1D grid.
// NOTE(review): Grid1(0, n+1) presumably describes the index range of the
// grid including one boundary point at each end - confirm against Grid1.
Grid1 *g = new Grid1(0, n+1);
// Iterator over the sub-range (1, n) of g; it drives the loop below.
Grid1IteratorSub it(1, n, g);
// x and y are both built on grid g.
DistArray x(g), y(g);
// Accumulator that the loop adds sqr(y(i)) to.
float e = 0;
...
// For every index i delivered by `it`: add the average of y's two
// neighbouring entries to x(i) and add sqr(y(i)) to e.
ForEach(int i, it,
x(i) += ( y(i+1) + y(i-1) )*.5;
e += sqr( y(i) ); )
...
|
main code:
#include <mpp/shmem.h>
...
// Start the SHMEM runtime before any other SHMEM routine is used.
start_pes(0);
...
// Block partition: each PE owns nn consecutive indices, the half-open
// range [n_local0, n_local1) in global numbering (n_local1 == n_local0 + nn).
// NOTE(review): the integer division drops any remainder of
// (n-1) / _num_pes(); confirm that the caller guarantees divisibility,
// otherwise the trailing indices are not processed by any PE.
int nn = (n-1) / _num_pes();
int n_local0 = 1 + _my_pe() * nn;
int n_local1 = 1 + (_my_pe()+1) * nn;
// allocate only local part + ghost zone of the arrays x,y:
// nn owned elements plus one ghost cell on each side, i.e. nn+2 floats.
// The pointer returned by shmalloc is shifted back by (n_local0-1) so that
// x[i] and y[i] can be addressed with the global index i; element
// n_local0-1 is the first float of the allocation.
// NOTE(review): the shmalloc results are not checked for NULL, and for
// n_local0 > 1 the shifted pointer lies before the allocated object, which
// ISO C leaves undefined - confirm this is acceptable on the target platform.
float *x = (float*)shmalloc((nn+2)*sizeof(float)) - (n_local0-1);
float *y = (float*)shmalloc((nn+2)*sizeof(float)) - (n_local0-1);
...
// fill ghost zone
// All PEs synchronise here before any PE reads a neighbour's y.
shmem_barrier_all();
// Left ghost cell y[n_local0-1] <- last owned element of PE _my_pe()-1.
// shmem_float_get(target, source, nelems, pe) takes the source as a local
// symmetric address: &y[n_local1-1] lies nn floats past the start of this
// PE's allocation, the same position the neighbour's last owned element
// occupies in the neighbour's allocation.
if (_my_pe() > 0)
shmem_float_get(&y[n_local0-1], &y[n_local1-1], 1, _my_pe()-1);
// Right ghost cell y[n_local1] <- first owned element of PE _my_pe()+1.
// &y[n_local0] lies one float past the start of the allocation, which is
// where every PE stores its first owned element.
if (_my_pe() < _num_pes()-1)
shmem_float_get(&y[n_local1], &y[n_local0], 1, _my_pe()+1);
// do computation
// Each PE processes only its own range [n_local0, n_local1) and collects
// its share of the sum of squares of y in e_local.
float e_local = 0;
for (int i=n_local0; i<n_local1; ++i) {
// y[i-1] and y[i+1] touch the ghost cells at the two ends of the range.
x[i] += ( y[i+1] + y[i-1] )*.5;
e_local += y[i] * y[i];
}
static float work[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long sync[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static float el, e;
el = e_local;
shmem_float_sum_to_all(&e, &el, 1,
0, 0, _num_pes(), work, sync);
...
// Release the symmetric arrays. x and y were shifted back by (n_local0-1)
// when they were allocated, so the shift is undone here to hand shfree the
// addresses that shmalloc originally returned.
shfree(x + (n_local0-1));
shfree(y + (n_local0-1));
|