Cell Broadband Engine Architecture

Currently available in the Sony PlayStation 3 and in IBM Cell blade servers.
The architecture consists of a host processor (PPU) with global main memory and up to 8 synergistic co-processors (SPUs). Data is transferred explicitly by user-controlled DMA block transfers. Each SPU has a limited local memory (256 KB) and is programmed with special SPU library calls.

A simplified translation of the following example parallel-for loop is given below.

// Source form of the loop that the hand-written PPU/SPU code below implements.
// NOTE(review): presumably a 1-D grid over the indices 0..n+1 with an iterator
// over the interior indices 1..n -- confirm against the framework documentation.
Grid1 *g = new Grid1(0, n+1);
Grid1IteratorSub it(1, n, g);
DistArray x(g), y(g);
...
float e = 0;
// Per index i: add half the sum of the two neighbours of y to x(i), and
// accumulate sqr(y(i)) into the scalar e (a reduction across iterations).
ForEach(int i, it,
   x(i) += ( y(i+1) + y(i-1) )*.5;
   e += sqr( y(i) ); )
...



First, a common include file sub1.h defines the interface between PPU and SPU:

// Parameter block exchanged between the PPU host code and the SPU program.
// The host fills in one instance per SPU; the SPU copies it into its local
// memory by DMA and writes it back with the reduction result filled in.
class data1 {
public:
   int n0;       // start of the index range handled by this SPU
   int n1;       // end of the index range (exclusive)
   float *x;     // main-memory array that is updated in place
   float *y;     // main-memory array that is only read
   float e;      // partial sum of squares, set by the SPU
   int pad[27];  // filler so that the block is 128 bytes long
                 // (holds for 4-byte int, float and pointers)

   data1() {}
};


and the host code, starting with global declarations:

#include <libspe2.h>
#include <pthread.h>
#include "sub1.h"
extern spe_program_handle_t spe1; // SPU program image, defined in spe1.cc
// One parameter block per SPU. The blocks are DMA sources/targets and are
// therefore aligned to 128 bytes. GCC attribute syntax needs the double
// parentheses: __attribute__ ((...)).
data1 vec[8] __attribute__ ((aligned(128)));
spe_context_ptr_t ctxs[8];  // one SPE context per SPU
spe_stop_info_t st[8];      // stop information filled in by spe_context_run
float *x, *y;               // the two arrays in main memory
int nn, pp;                 // element count and partition count used by sub1


the PPU thread code:

void *sub1(void *arg) {
   p = (int)arg;
   vec[p].n0 = 1 + (((nn * p) / pp) & ~31);
   vec[p].n1 = 1 + (((nn * (p+1)) / pp) & ~31);
   vec[p].x = &x[1];
   vec[p].y = &y[0];
   unsigned int entry = SPE_DEFAULT_ENTRY;
  // execute code on a single SPU and wait for termination
   spe_context_run(ctxs[p], &entry, 0, &vec[p], NULL, st[p]);
   return (void*) 0;
}


and the PPU main code:

x = new float[n+1];
y = new float[n+1];
...
float e = 0;
int spe = 8;
nn = n-1;
pp = p_threads;
threads _Pthread_t[8];
pthread_attr_t attr;
pthread_attr_init(&attr);
// create one thread per SPU for (int p=0; p<spe; ++p) {
   ctxs[p] = spe_context_create(0, NULL);
   spe_program_load (ctxs[p], &spe1);
   pthread_create(&threads[p], attr,
     sub1, (void *)p);
}
for (int p=0; p<spe; ++p) {
   pthread_join(threads[p], NULL);
   spe_context_destroy(ctxs[p]);
   e += vec.e[p];
}
...
delete[] x, y;


and finally the SPU code in a separate file spe1.cc in order to create the symbol spe1:

#include <spu_mfcio.h>
#include <stdlib.h>
#include "sub1.h"
#define block 2048
int main(unsigned long long id,
     unsigned long long argp, unsigned long long envp) {
   data1 vec __attribute__ (aligned(128));
 // read parameter
   mfc_get(&vec, (unsigned int)argp, sizeof(data1), 1, 0, 0);
   mfc_write_tag_mask(0xffffffff);
   mfc_read_tag_status_all();// wait for data transfer
   float e = 0;
 // allocate double buffer in local SPU memory
   float *x = (float*)malloc(block*sizeof(float));
   float *x0 = (float*)malloc(block*sizeof(float));
   float *y = (float*)malloc((block+32)*sizeof(float)) + 1;
   float *y0 = (float*)malloc((block+32)*sizeof(float)) + 1;
 // get first buffer
   mfc_get(x, &vec.x[vec.n0], block*sizeof(float), 2, 0, 0);
   mfc_get(y-1, &vec.y[vec.n0], (block+32)*sizeof(float), 3, 0, 0);
   mfc_read_tag_status_all();
   for (int ib=vec.n0; ib<vec.n1; ib += block) {
     if (ib+block<vec.n1) {
     // get next buffer, fence after put x
       mfc_getf(x0, &vec.x[ib+block], block*sizeof(float), 2, 0, 0);
       mfc_get(y0-1, &vec.y[ib+block], (block+32)*sizeof(float), 3, 0, 0);
     }
     for (int i=0; i<block; ++i) {
     // do computation
       x[i] += ( y[i+1]) + y[i-1] )*.5;
       e += y[i] * y[i];
     }
     mfc_read_tag_status_all(); // wait for data transfer
   // put current buffer
     mfc_put(x, &vec.x[ib], block*sizeof(float), 2, 0, 0);
     float *t = x; x = x0; x0 = t;
     t = y; y = y0; y0 = t; // swap buffers
   }
 // put reduction value
   vec.e = e;
   mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);
   mfc_read_tag_status_all(); // wait for data transfer
   return 0;
}

Data that is not aligned as assumed here (x[1] and y[0] at 128-byte boundaries) and array sizes that are not a multiple of the double-buffer block size require some code modifications. Note that AltiVec SIMD instructions are needed for better performance of the SPU code.