#include <stdlib.h>
#include <spu_mfcio.h>
#include "sub1.h"
#define BLOCK (2048)
/*
 * SPU entry point.
 *
 * Fetches a data1 parameter block from the address passed in argp, then
 * streams the index range [vec.n0, vec.n1) of vec.x / vec.y through local
 * buffers in chunks of BLOCK floats. Two buffer pairs are used so that the
 * transfer of the next chunk overlaps the computation on the current one.
 *
 * For every chunk it computes x[i] = 0.5 * (y[i-1] + y[i+1]), accumulates
 * the sum of y[i]^2 into e, and writes x back. Finally e is stored in vec.e
 * and the parameter block is written back to argp.
 *
 * id and envp are not used.
 *
 * Returns 0 on success, 1 if a local buffer could not be allocated.
 *
 * NOTE(review): the vec_* intrinsics need a header that is not included in
 * the visible part of this file -- confirm that "sub1.h" provides it.
 * NOTE(review): the loop always processes and writes a full BLOCK; this
 * presumably relies on (vec.n1 - vec.n0) being a multiple of BLOCK -- verify
 * against the code that fills in data1.
 * NOTE(review): the DMA transfers assume malloc returns suitably aligned
 * (16-byte) storage -- confirm for the target C library.
 */
int main(unsigned long long id,
         unsigned long long argp, unsigned long long envp) {
    data1 vec __attribute__((aligned(128)));

    // read parameter block
    mfc_get(&vec, (unsigned int)argp, sizeof(data1), 1, 0, 0);
    mfc_write_tag_mask(0xffffffff);   // wait on every tag group from now on
    mfc_read_tag_status_all();        // wait for data transfer

    float e = 0;

    // allocate the two buffer pairs; the y buffers carry 32 extra floats
    float *x = (float*)malloc(BLOCK*sizeof(float));
    float *x0 = (float*)malloc(BLOCK*sizeof(float));
    float *y_base = (float*)malloc((BLOCK+32)*sizeof(float));
    float *y0_base = (float*)malloc((BLOCK+32)*sizeof(float));
    if (x == NULL || x0 == NULL || y_base == NULL || y0_base == NULL) {
        free(x);
        free(x0);
        free(y_base);
        free(y0_base);
        return 1;
    }
    // y / y0 point one element past the DMA target so that y[-1] is valid
    float *y = y_base + 1;
    float *y0 = y0_base + 1;

    // get first buffer
    mfc_get(x, &vec.x[vec.n0], BLOCK*sizeof(float), 2, 0, 0);
    mfc_get(y-1, &vec.y[vec.n0], (BLOCK+32)*sizeof(float), 3, 0, 0);
    mfc_read_tag_status_all();

    for (int ib = vec.n0; ib < vec.n1; ib += BLOCK) {
        if (ib+BLOCK < vec.n1) {
            // get next buffer; the fenced get on tag 2 is ordered after the
            // earlier put of this x buffer on the same tag
            mfc_getf(x0, &vec.x[ib+BLOCK], BLOCK*sizeof(float), 2, 0, 0);
            mfc_get(y0-1, &vec.y[ib+BLOCK], (BLOCK+32)*sizeof(float), 3, 0, 0);
        }

        // do computation, four floats per step
        // ve is read and written with vec_ld / vec_st, so it must be
        // 16-byte aligned
        float ve[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
        for (int i = 0; i < BLOCK; i += 4) {
            float *yp = &y[i+1], *yc = &y[i], *ym = &y[i-1];

            // x[i..i+3] = 0.5 * (y[i-1..i+2] + y[i+1..i+4]);
            // vec_ld truncates the address to a 16-byte boundary, so each
            // operand is assembled from two aligned loads with vec_perm
            vec_st(vec_madd(
                       vec_splats(.5f),
                       vec_add(
                           vec_perm(vec_ld(0,ym), vec_ld(16,ym),
                                    vec_lvsl(0,ym)),
                           vec_perm(vec_ld(0,yp), vec_ld(16,yp),
                                    vec_lvsl(0,yp))),
                       vec_splats(0.f)),
                   0, &x[i]);

            // ve += y[i..i+3]^2; &y[i] is never 16-byte aligned because y is
            // offset by one float from the DMA buffer, so it needs the same
            // realignment as the loads above
            vec_st(vec_add(
                       vec_ld(0,&ve[0]),
                       vec_madd(
                           vec_perm(vec_ld(0,yc), vec_ld(16,yc),
                                    vec_lvsl(0,yc)),
                           vec_perm(vec_ld(0,yc), vec_ld(16,yc),
                                    vec_lvsl(0,yc)),
                           vec_splats(0.f))),
                   0, &ve[0]);
        }
        e += ve[0] + ve[1] + ve[2] + ve[3];

        mfc_read_tag_status_all();    // wait for data transfer

        // put current buffer
        mfc_put(x, &vec.x[ib], BLOCK*sizeof(float), 2, 0, 0);

        // swap buffers
        float *t = x; x = x0; x0 = t;
        t = y; y = y0; y0 = t;
    }

    // put reduction value
    vec.e = e;
    mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);
    mfc_read_tag_status_all();        // wait for data transfer

    // all transfers have completed, so the local buffers can be released
    free(x);
    free(x0);
    free(y_base);
    free(y0_base);
    return 0;
}