Parallel For

Cell Broadband Engine Architecture

Currently available in the Sony PlayStation 3 and in IBM Cell blade servers.
The architecture consists of a host processor (PPU) with global main memory and up to 8 synergistic co-processors (SPUs). Data is transferred explicitly by user-controlled DMA block transfers. Each SPU has a limited local memory (256 KB) and is programmed with special SPU library calls.

A simplified translation of the following example parallel-for loop is given below.


// Index grid for the example; the arguments are presumably the first and last
// index (0 and n+1) -- confirm against the Grid1 declaration.
Grid1 *g = new Grid1(0, n+1);
 
// Iterator over part of g; presumably the indices 1..n -- confirm.
Grid1IteratorSub it(1, n, g);
 
// Two arrays attached to the grid g.
DistArray x(g), y(g);
 
...

// Accumulator for the sum of squares collected inside the loop below.
float e = 0;
 
// Parallel-for: for every index i delivered by it, x(i) is updated from the two
// neighbours of y(i), and sqr(y(i)) is added to e.
ForEach(int i, it,
 
  	x(i) += ( y(i+1) + y(i-1) )*.5;
 
  	e    += sqr( y(i) ); )
 
...

First, a common include file sub1.h defines the interface between PPU and SPU:


// Parameter block shared by the PPU thread code and the SPU program.
// The PPU side fills in the index range and the array pointers; the SPU side
// writes its partial reduction result into e and sends the block back.
struct data1 {

    // Members are deliberately left uninitialized.
    data1() {}

    int n0;        // first index of the slice handled by one SPU
    int n1;        // end index of the slice (the SPU loop runs while ib < n1)

    float *x;      // base address of the array that is updated
    float *y;      // base address of the array that is only read

    float e;       // partial sum of squares, filled in by the SPU

    // Filler so the block is 128 bytes long; this holds when int, float and
    // pointers are 4 bytes each (5 * 4 + 27 * 4 = 128).
    int pad[27];

};

and the host code, starting with global declarations:


#include <libspe2.h>

#include <pthread.h>

#include "sub1.h"

extern spe_program_handle_t spe1; // defined in spe1.cc

// One parameter block per SPU. Each block is the source and target of a DMA
// transfer issued by the SPU program, hence the 128-byte alignment.
// GCC attribute syntax needs the double parentheses: __attribute__ ((...)).
data1 vec[8] __attribute__ ((aligned(128)));

// One SPE context per SPU, created and destroyed by the PPU main code.
spe_context_ptr_t ctxs[8];

// Stop information filled in by spe_context_run for each SPU.
spe_stop_info_t st[8];

// Global arrays in main memory that the SPUs read and update via DMA.
float *x, *y;

// nn and pp are read by sub1 to split the index range:
// slice p starts at (nn * p) / pp, rounded down to a multiple of 32.
int nn, pp;

the PPU thread code:


void *sub1(void *arg) {

    p = (int)arg;

    vec[p].n0 = 1 + (((nn * p) / pp) & ~31);

    vec[p].n1 = 1 + (((nn * (p+1)) / pp) & ~31);

    vec[p].x = &x[1];

    vec[p].y = &y[0];

    unsigned int entry = SPE_DEFAULT_ENTRY;

  // execute code on a single SPU and wait for termination

    spe_context_run(ctxs[p], &entry, 0, &vec[p], NULL, st[p]);

    return (void*) 0;

}

and the PPU main code:


x = new float[n+1];

y = new float[n+1];

...

float e = 0;
 
int spe = 8;

nn = n-1;

pp = p_threads;

threads _Pthread_t[8];

pthread_attr_t attr;

pthread_attr_init(&attr);

// create one thread per SPU
for (int p=0; p<spe; ++p) {

    ctxs[p] = spe_context_create(0, NULL);

    spe_program_load (ctxs[p], &spe1);

    pthread_create(&threads[p], attr,

      sub1, (void *)p);

}

for (int p=0; p<spe; ++p) {

    pthread_join(threads[p], NULL);

    spe_context_destroy(ctxs[p]);

    e += vec.e[p];

}

...

delete[] x, y;

and finally the SPU code in a separate file spe1.cc in order to create the symbol spe1:


#include <stdlib.h>

#include <spu_mfcio.h>

#include "sub1.h"

#define block 2048

int main(unsigned long long id,

             unsigned long long  argp, unsigned long long  envp) {

    data1 vec  __attribute__ (aligned(128));

 // read parameter

    mfc_get(&vec, (unsigned int)argp, sizeof(data1), 1, 0, 0);

    mfc_write_tag_mask(0xffffffff);

    mfc_read_tag_status_all();// wait for data transfer

    float e = 0;

 // allocate double buffer in local SPU memory

    float *x = (float*)malloc(block*sizeof(float));

    float *x0 = (float*)malloc(block*sizeof(float));

    float *y = (float*)malloc((block+32)*sizeof(float)) + 1;

    float *y0 = (float*)malloc((block+32)*sizeof(float)) + 1;


 // get first buffer 

    mfc_get(x, &vec.x[vec.n0], block*sizeof(float), 2, 0, 0);

    mfc_get(y-1, &vec.y[vec.n0], (block+32)*sizeof(float), 3, 0, 0);

    mfc_read_tag_status_all();

    for (int ib=vec.n0; ib<vec.n1; ib += block) {

        if (ib+block<vec.n1) {

     // get next buffer, fence after put x

        mfc_getf(x0, &vec.x[ib+block], block*sizeof(float), 2, 0, 0);

        mfc_get(y0-1, &vec.y[ib+block], (block+32)*sizeof(float), 3, 0, 0);

        }

        for (int i=0; i<block; ++i) {

     // do computation

          x[i] += ( y[i+1]) + y[i-1] )*.5;

          e += y[i] * y[i];

        }

        mfc_read_tag_status_all(); // wait for data transfer

   // put current buffer

        mfc_put(x, &vec.x[ib], block*sizeof(float), 2, 0, 0);

        float *t = x; x = x0; x0 = t;

        t = y; y = y0; y0 = t; // swap buffers

    }

 // put reduction value 

    vec.e = e;

    mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);

    mfc_read_tag_status_all(); // wait for data transfer

    return 0;

}

The code above assumes that x[1] and y[0] lie on 128-byte boundaries and that the array sizes are a multiple of the double-buffer block size. Unaligned data and other array sizes require some code modifications. Note that AltiVec-style SIMD instructions should be used for better performance of the SPU code.