Parallel For

cluster	SMP & multi-core	CPU off-loading	vectorization
uniform memory	single thread	CPU	scalar
MPI	POSIX threads	Cell BE	SSE
PVM	OpenMP	Cuda	AltiVec
MPI-2	boost threads	OpenCL
shmem	TBB	Brook+
DCMF	HMPP
UPC	RapidMind
parallel-for	Ct

code example
OpenMP multi-threading
Cell Broadband Engine
file sub1.h: common interface


 // Parameter block shared by the host program and the SPU kernel.
 // The SPU code transfers it as a whole (sizeof(data1) bytes) in both
 // directions, so the member order and the trailing padding must not change.
 struct data1 {
     data1() {}

     int n0;        // first index of the range to process
     int n1;        // end index of the range (exclusive in the SPU loop)

     float *x;      // base address of the array that is updated
     float *y;      // base address of the array that is only read

     float e;       // reduction value written back by the SPU

     // pad to 128 byte length (holds when int, float and pointers are
     // 4 bytes each: 2*4 + 2*4 + 4 + 27*4 = 128)
     int pad[27];
 };

file spe1.cc: SPU code


 #include <spu_mfcio.h>
#include <stdlib.h>

#include "sub1.h"

#define BLOCK (2048)

int main(unsigned long long id,

             unsigned long long  argp, unsigned long long  envp) {

  data1 vec  __attribute__ (aligned(128));

  // read parameter

  mfc_get(&vec, (unsigned int)argp, sizeof(data1), 1, 0, 0);

  mfc_write_tag_mask(0xffffffff);

  mfc_read_tag_status_all();// wait for data transfer

  float e = 0;

  // allocate double buffer in local SPU memory

  float *x = (float*)malloc(BLOCK*sizeof(float));

  float *x0 = (float*)malloc(BLOCK*sizeof(float));

  float *y = (float*)malloc((BLOCK+32)*sizeof(float)) + 1;

  float *y0 = (float*)malloc((BLOCK+32)*sizeof(float)) + 1;

  // get first buffer

  mfc_get(x, &vec.x[vec.n0], BLOCK*sizeof(float), 2, 0, 0);

  mfc_get(y-1, &vec.y[vec.n0], (BLOCK+32)*sizeof(float), 3, 0, 0);

  mfc_read_tag_status_all();

  for (int ib=vec.n0; ib<vec.n1; ib += BLOCK) {

      if (ib+BLOCK<vec.n1) {

       // get next buffer, fence after put x

       mfc_getf(x0, &vec.x[ib+BLOCK], BLOCK*sizeof(float), 2, 0, 0);

       mfc_get(y0-1, &vec.y[ib+BLOCK], (BLOCK+32)*sizeof(float), 3, 0, 0);

      }

  // do computation

  for (int i=0; iBLOCK; ++i) {
 
    x[i] += ( y[i+1] + y[i-1] )*.5;
 
    e    += y[i] * y[i];
 
  }


      mfc_read_tag_status_all(); // wait for data transfer

      // put current buffer

      mfc_put(x, &vec.x[ib], BLOCK*sizeof(float), 2, 0, 0);

      float *t = x; x = x0; x0 = t;

      t = y; y = y0; y0 = t; // swap buffers

  }

  // put reduction value

  vec.e = e;

  mfc_put(&vec, argp, sizeof(data1), 1, 0, 0);

  mfc_read_tag_status_all(); // wait for data transfer

  return 0;

}

main file



#include <omp.h>

#include <libspe2.h>

#include "sub1.h"






// Global reduction result: inside main() each OpenMP thread adds the
// partial value returned by its SPU (block.e) to this variable.
float e;
    
 



 
 

extern spe_program_handle_t spe1; // defined in SPU code
 
 
 
 
 
 
 


int main(int argc, char *argv[]) {


  int n = ...;

 
 
 
 
 
 
 
 
  
float *x, *y;

 

  x = new float[n+1];

  y = new float[n+1];

 
 
 
 



  ... // fill x, y

 


 
 
 
 
 
 
 
 

  e = 0;
 
  #pragma omp parallel

  {

  int p = omp_get_thread_num();

  int num = omp_get_num_threads();

 


  data1 block __attribute__ (aligned(128));

  spe_context_ptr_t ctxs;

  spe_stop_info_t st;

  ctxs = spe_context_create(0, NULL);

  spe_program_load (ctxs, &spe1);

  block.n0 = 1;

  block.n1 = n;

  block.x = &x[1];

  block.y = &y[1-1];

  unsigned int entry = SPE_DEFAULT_ENTRY;

 
  int n0 = 1+((n-1)*p)/num;

  int n1 = 1+((n-1)*(p+1))/num;


  // execute code on a single SPU and wait for termination

  spe_context_run(ctxs[p], &entry, 0, &block, NULL, st);

  spe_context_destroy(ctxs);


  #pragma omp atomic


  e += block.e;

   }

      


 
 
 
 
 

  ... // output x, e





 

  delete[] x, y;

 
  
  return 0;

}

[start] [references] [download] [install]