Cell Broadband Engine Architecture
Currently available in the Sony PlayStation 3 and IBM Cell blade servers.
A host processor (PPU) with global main memory and up to 8 synergistic
co-processors (SPUs); explicit data transfer by user-controlled DMA
block transfers. Limited local SPU memory (256 KB); special SPU library
calls.
A simplified translation of the following example parallel-for loop is given below.
// Grid, sub-range iterator and the two arrays used by the example loop.
// NOTE(review): presumably the grid covers indices 0..n+1 and the iterator
// runs over 1..n — confirm against the Grid1 / Grid1IteratorSub API.
Grid1 *g = new Grid1(0, n+1);
Grid1IteratorSub it(1, n, g);
DistArray x(g), y(g);
...
float e = 0; // reduction variable accumulated inside the loop
// Parallel-for loop: updates x(i) from the two neighbours of y(i) and
// accumulates sqr(y(i)) into e.
ForEach(int i, it,
x(i) += ( y(i+1) + y(i-1) )*.5;
e += sqr( y(i) ); )
...
|
The translation consists of a common
include file sub1.h for the interface between PPU and SPU:
// Parameter block shared by the PPU and one SPU. The PPU thread fills in
// n0, n1, x and y; the SPU copies the whole object by DMA, writes e, and
// copies it back.
class data1 {
public:
data1() {} // members are intentionally left uninitialized
int n0, n1; // start and end index of the range handled by this SPU
float *x, *y; // main-memory addresses of the arrays; used as DMA addresses on the SPU
float e; // partial reduction result written by the SPU
// Pads the object to 128 bytes. The sum is 4*4 + 4 + 27*4 = 128 only when
// int, float and pointers are all 4 bytes wide.
// NOTE(review): with 8-byte pointers on the PPU side the size and layout
// would no longer match the SPU's view — confirm the PPU build is 32-bit.
int pad[27];// pad to 128 byte length
};
|
and the host code, starting with
global declarations:
#include <libspe2.h>
#include <pthread.h>
#include "sub1.h"
extern spe_program_handle_t spe1; // SPU program image; symbol defined in spe1.cc
// One parameter block per SPU. Each block is fetched by the SPU with a DMA
// transfer, hence the 128-byte alignment. GCC requires double parentheses
// around the attribute list.
data1 vec[8] __attribute__ ((aligned(128)));
spe_context_ptr_t ctxs[8]; // one SPE context per SPU
spe_stop_info_t st[8];     // stop information filled in by spe_context_run
float *x, *y;              // main-memory arrays shared with the SPUs
int nn, pp;                // total index count and number of partitions
|
the
PPU thread code:
void *sub1(void *arg) {
p = (int)arg;
vec[p].n0 = 1 + (((nn * p) / pp) & ~31);
vec[p].n1 = 1 + (((nn * (p+1)) / pp) & ~31);
vec[p].x = &x[1];
vec[p].y = &y[0];
unsigned int entry = SPE_DEFAULT_ENTRY;
// execute code on a single SPU and wait for termination
spe_context_run(ctxs[p], &entry, 0, &vec[p], NULL, st[p]);
return (void*) 0;
}
|
and the
PPU main code:
// The grid spans indices 0..n+1 and the loop reads y(i+1) for i = n,
// so both arrays need n+2 elements.
// NOTE(review): the SPU code additionally assumes x[1] and y[0] lie on
// 128-byte boundaries, which plain new[] does not guarantee.
x = new float[n+2];
y = new float[n+2];
...
// Start one PPU thread per SPU, wait for all of them, and sum the partial
// reduction results the SPUs wrote back into vec[p].e.
float e = 0;
int spe = 8;
// NOTE(review): the example loop runs over 1..n (n indices); n-1 looks one
// short — confirm the intended element count before changing it.
nn = n-1;
// The index range must be split into exactly as many parts as threads are
// started below, otherwise some SPUs receive ranges outside the arrays.
pp = spe;
pthread_t threads[8];
pthread_attr_t attr;
pthread_attr_init(&attr);
// create one thread per SPU
for (int p=0; p<spe; ++p) {
  ctxs[p] = spe_context_create(0, NULL);
  spe_program_load(ctxs[p], &spe1);
  // the attribute object is passed by address; the thread index travels
  // by value inside the pointer argument
  pthread_create(&threads[p], &attr,
                 sub1, (void *)(long)p);
}
pthread_attr_destroy(&attr);
for (int p=0; p<spe; ++p) {
  pthread_join(threads[p], NULL);
  spe_context_destroy(ctxs[p]);
  e += vec[p].e; // partial sum written back by SPU p
}
...
// "delete[] x, y;" parses as "(delete[] x), y;" and would leak y.
delete[] x;
delete[] y;
|
and finally the
SPU code in a separate file spe1.cc in order to create the symbol spe1:
#include <stdlib.h>
#include <spu_mfcio.h>
#include "sub1.h"
#define block 2048 // number of floats per DMA buffer
// SPU entry point. Fetches its parameter block from main memory, streams
// the assigned index range through two alternating local buffers, updates x
// in place and writes the partial sum of squares of y back into the
// parameter block.
//
// id:      SPE identifier (unused here).
// argp:    main-memory address of this SPU's data1 parameter block.
// envp:    environment pointer (unused here).
// returns: 0 on success, 1 if the local buffers could not be allocated.
int main(unsigned long long id,
         unsigned long long argp, unsigned long long envp) {
  // GCC requires double parentheses around the attribute list.
  data1 vec __attribute__ ((aligned(128)));
  // read parameter
  mfc_get(&vec, (unsigned int)argp, sizeof(data1), 1, 0, 0);
  mfc_write_tag_mask(0xffffffff);
  mfc_read_tag_status_all();// wait for data transfer
  float e = 0;
  // allocate double buffer in local SPU memory; the raw pointers are kept
  // so the buffers can be released after the x/y pointers have been swapped
  // NOTE(review): DMA needs suitably aligned local buffers — confirm that
  // malloc on the SPU provides this, or switch to an aligned allocator.
  float *x_mem  = (float*)malloc(block*sizeof(float));
  float *x0_mem = (float*)malloc(block*sizeof(float));
  float *y_mem  = (float*)malloc((block+32)*sizeof(float));
  float *y0_mem = (float*)malloc((block+32)*sizeof(float));
  if (!x_mem || !x0_mem || !y_mem || !y0_mem) {
    free(x_mem);
    free(x0_mem);
    free(y_mem);
    free(y0_mem);
    return 1;
  }
  float *x = x_mem;
  float *x0 = x0_mem;
  // y buffers are offset by one element so that y[-1] is a valid access
  float *y = y_mem + 1;
  float *y0 = y0_mem + 1;
  // get first buffer
  mfc_get(x, &vec.x[vec.n0], block*sizeof(float), 2, 0, 0);
  mfc_get(y-1, &vec.y[vec.n0], (block+32)*sizeof(float), 3, 0, 0);
  mfc_read_tag_status_all();
  for (int ib=vec.n0; ib<vec.n1; ib += block) {
    if (ib+block<vec.n1) {
      // get next buffer, fence after put x
      mfc_getf(x0, &vec.x[ib+block], block*sizeof(float), 2, 0, 0);
      mfc_get(y0-1, &vec.y[ib+block], (block+32)*sizeof(float), 3, 0, 0);
    }
    for (int i=0; i<block; ++i) {
      // do computation
      x[i] += ( y[i+1] + y[i-1] )*.5;
      e += y[i] * y[i];
    }
    mfc_read_tag_status_all(); // wait for data transfer
    // put current buffer
    mfc_put(x, &vec.x[ib], block*sizeof(float), 2, 0, 0);
    float *t = x; x = x0; x0 = t;
    t = y; y = y0; y0 = t; // swap buffers
  }
  // put reduction value
  vec.e = e;
  mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);
  mfc_read_tag_status_all(); // wait for data transfer
  // all transfers have completed, so the local buffers can be released
  free(x_mem);
  free(x0_mem);
  free(y_mem);
  free(y0_mem);
  return 0;
}
|
The code assumes that x[1] and y[0] lie on 128-byte boundaries and that the
array size is a multiple of the double-buffer block size; unaligned data and
other array sizes require some code modifications. Note
that
AltiVec SIMD instructions can be used for better
performance of the SPU code.