#include <spu_mfcio.h>
#include "sub1.h"
#define BLOCK (2048)
/*
 * SPU entry point.
 *
 * Fetches a data1 parameter block from the effective address `argp`, then
 * walks the index range [vec.n0, vec.n1) in chunks of BLOCK floats.  For each
 * chunk it computes x[i] = 0.5 * (y[i+1] + y[i-1]) on the local copy, writes
 * the x chunk back with DMA, and accumulates the sum of y[i]^2.  The sum is
 * stored in vec.e and the parameter block is written back to `argp`.
 *
 * Two local buffers per array are used so that the transfer of the next
 * chunk overlaps the computation on the current one.
 *
 * id, envp: unused.
 * argp:     effective address of the data1 parameter block.
 * Returns 0.
 *
 * NOTE(review): every chunk transfers exactly BLOCK floats, so this assumes
 * (vec.n1 - vec.n0) is a multiple of BLOCK; otherwise the last put writes
 * past n1 -- confirm against the caller.
 * NOTE(review): vec.x / vec.y are presumably main-memory addresses supplied
 * by the caller in data1 (declared in sub1.h) -- confirm.
 */
int main(unsigned long long id,
         unsigned long long argp, unsigned long long envp) {
    /*
     * Local DMA buffers.  They are statically allocated with 128-byte
     * alignment instead of malloc'ed: MFC transfers need aligned local
     * addresses, the allocation cannot fail, and nothing has to be freed.
     * Both row sizes (BLOCK*4 and (BLOCK+32)*4 bytes) are multiples of 128,
     * so the second row of each array is aligned as well.
     */
    static float xbuf[2][BLOCK] __attribute__((aligned(128)));
    static float ybuf[2][BLOCK + 32] __attribute__((aligned(128)));
    data1 vec __attribute__((aligned(128)));

    (void)id;
    (void)envp;

    // read parameter block; pass the full 64-bit address, as the final put does
    mfc_get(&vec, argp, sizeof(data1), 1, 0, 0);
    mfc_write_tag_mask(0xffffffff);   // status reads below wait on every tag
    mfc_read_tag_status_all();        // wait for data transfer

    float e = 0;
    const float half = 0.5f;

    // x/y are the buffers being computed on, x0/y0 the ones being prefetched.
    // y points one float past its buffer base so that y[i-1] is valid for i == 0.
    float *x = xbuf[0];
    float *x0 = xbuf[1];
    float *y = ybuf[0] + 1;
    float *y0 = ybuf[1] + 1;

    // get first buffer
    mfc_get(x, &vec.x[vec.n0], BLOCK * sizeof(float), 2, 0, 0);
    mfc_get(y - 1, &vec.y[vec.n0], (BLOCK + 32) * sizeof(float), 3, 0, 0);
    mfc_read_tag_status_all();

    for (int ib = vec.n0; ib < vec.n1; ib += BLOCK) {
        if (ib + BLOCK < vec.n1) {
            // get next buffer; the fenced get on tag 2 is ordered after the
            // put of x issued from this same buffer in the previous iteration
            mfc_getf(x0, &vec.x[ib + BLOCK], BLOCK * sizeof(float), 2, 0, 0);
            mfc_get(y0 - 1, &vec.y[ib + BLOCK], (BLOCK + 32) * sizeof(float), 3, 0, 0);
        }

        // do computation; ve holds four partial sums of y[i]^2 and must be
        // 16-byte aligned for the aligned load/store below
        float ve[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
        for (int i = 0; i < BLOCK; i += 4) {
            _mm_store_ps(&x[i],
                         _mm_mul_ps(_mm_load1_ps(&half),
                                    _mm_add_ps(_mm_loadu_ps(&y[i + 1]),
                                               _mm_loadu_ps(&y[i - 1]))));
            // y is offset by one float from its aligned base, so &y[i] is not
            // 16-byte aligned: use the unaligned load here
            _mm_store_ps(&ve[0],
                         _mm_add_ps(_mm_load_ps(&ve[0]),
                                    _mm_mul_ps(_mm_loadu_ps(&y[i]),
                                               _mm_loadu_ps(&y[i]))));
        }
        e += ve[0] + ve[1] + ve[2] + ve[3];

        mfc_read_tag_status_all();    // wait for data transfer

        // put current buffer
        mfc_put(x, &vec.x[ib], BLOCK * sizeof(float), 2, 0, 0);

        // swap buffers
        float *t = x;
        x = x0;
        x0 = t;
        t = y;
        y = y0;
        y0 = t;
    }

    // put reduction value
    vec.e = e;
    mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);
    mfc_read_tag_status_all();        // wait for data transfer (also covers the last put of x)
    return 0;
}
|