Parallel For

Intel Threading Building Blocks

A C++ library for thread programming on shared-memory systems, e.g. SMP multi-processor computers, multi-core processors, and virtual shared-memory computers.
Data layout: A single global memory. Each thread reads global shared data and writes to a private fraction of global data.
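A simplified translation of the following example parallel-for loop is given below. Because the loop also accumulates the sum e across iterations, the translation uses parallel_reduce rather than a plain parallel_for.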


// Reference loop, written in the grid/array notation that the TBB code
// further below translates.
// NOTE(review): Grid1, Grid1IteratorSub, DistArray, ForEach and sqr are not
// defined in this excerpt; in particular, whether the bounds passed to
// Grid1 and Grid1IteratorSub are inclusive should be confirmed against
// their definitions.
Grid1 *g = new Grid1(0, n+1);
 
Grid1IteratorSub it(1, n, g);
 
// Two arrays declared over the same grid g.
DistArray x(g), y(g);

...

// Accumulator for the sum computed inside the loop.
float e = 0;
 
// For every index i delivered by `it`: add half the sum of the two
// neighbouring y entries to x(i), and add sqr(y(i)) to e.
ForEach(int i, it,
 
  	x(i) += ( y(i+1) + y(i-1) )*.5;
 
  	e    += sqr( y(i) ); )
 
...

global:


#include "tbb/task_scheduler_init.h"

#include "tbb/blocked_range.h"

#include "tbb/parallel_reduce.h"

#include "tbb/cache_aligned_allocator.h"

using namespace tbb;

thread code:


struct sub1 {

    float ee;

    float *x, *y;

    sub1(float *xx, float *yy) : ee(0), x(xx), y(yy) {}

    sub1(sub1& s, split) { ee = 0; x = s.x; y = s.y; }

    void operator() (const blocked_range<int> & r){

        float e = ee;

        for (int i = r.begin(); i!= r.end(); ++i) {

            x[i] += ( y[i+1] + y[i-1] )*.5;

            e += y[i] * y[i];

        }

        ee = e;

    }

    void join(sub1& s) { ee += s.ee; }

};

main code:


 
  // Set up the TBB task scheduler (default-constructed).
  task_scheduler_init init;

...

  // Receives the reduced sum once parallel_reduce has returned.
  float e;

  // Storage for n+1 floats per array, obtained from TBB's
  // cache_aligned_allocator.
  float *x = cache_aligned_allocator<float>().allocate(n+1);

  float *y = cache_aligned_allocator<float>().allocate(n+1);

...

  // Body object holding both array pointers; its partial sum starts at 0.
  sub1 s(x, y);

  // Run the body over the index range starting at 1 and ending before n
  // (sub1 iterates while i != r.end()); 1000 is the grain size argument of
  // blocked_range.
  // NOTE(review): the reference loop uses Grid1IteratorSub it(1, n, g) on
  // Grid1(0, n+1). If those bounds are inclusive, this range should end at
  // n+1 and the arrays need n+2 elements - confirm.
  parallel_reduce(blocked_range<int>(1, n, 1000), s);

  // After the reduction, s holds the combined sum of all partial sums.
  e = s.ee;

...

  // Release the arrays with the same element count that was allocated.
  cache_aligned_allocator<float>().deallocate(x, n+1);

  cache_aligned_allocator<float>().deallocate(y, n+1);