Intel Threading Building Blocks

A C++ library for thread programming on shared-memory systems, e.g. SMP multi-processor computers, multi-core processors, and virtual shared-memory computers.
Data layout: a single global memory. Each thread reads the globally shared data and writes only to its own private portion of the global data.
A simplified translation of the following example parallel-for loop is given below.

// Example loop in grid/array notation; the TBB sections further down translate it.
// NOTE(review): Grid1, Grid1IteratorSub, DistArray, ForEach and sqr are not defined in
// this file. Judging from the translation (arrays of n+1 floats, loop over [1, n)),
// the bounds (0, n+1) and (1, n) are presumably half-open index ranges -- confirm.
Grid1 *g = new Grid1(0, n+1);
Grid1IteratorSub it(1, n, g);
DistArray x(g), y(g);
...
// e accumulates sqr(y(i)) over the iterated indices (y[i]*y[i] in the translation).
float e = 0;
ForEach(int i, it,
   x(i) += ( y(i+1) + y(i-1) )*.5;
   e += sqr( y(i) ); )
...



global:

#include "tbb/task_scheduler_init.h"
#include "tbb/blocked_range.h"
#include "tbb/parallel_reduce.h"
#include "tbb/cache_aligned_allocator.h"
using namespace tbb;



thread code:

struct sub1 {
   float ee;
   float *x, *y;
   sub1(float *xx, float *yy) : ee(0), x(xx), y(yy) {}
   sub1(sub1& s, split) { ee = 0; x = s.x; y = s.y; }
   void operator() (const blocked_range<int> & r){
     float e = ee;
     for (int i = r.begin(); i!= r.end(); ++i) {
       x[i] += ( y[i+1] + y[i-1] )*.5;
       e += y[i] * y[i];
     }
     ee = e;
   }
   void join(sub1& s) { ee += s.ee; }
};



main code:

// Start the TBB task scheduler; per the TBB documentation it stays active for
// the lifetime of this object.
task_scheduler_init init;
...
float e;
// Two arrays of n+1 floats each (indices 0..n) from TBB's cache-aligned allocator.
// allocate() only provides storage; presumably the elided code fills x and y.
float *x = cache_aligned_allocator<float>().allocate(n+1);
float *y = cache_aligned_allocator<float>().allocate(n+1);
...
// Run the reduction over the half-open index range [1, n) with grain size 1000.
// The body therefore updates x[1..n-1] and reads y[0..n], which fits the n+1 elements.
sub1 s(x, y);
parallel_reduce(blocked_range<int>(1, n, 1000), s);
// s now holds the joined sum of y[i]*y[i] over the whole range.
e = s.ee;
...
// Release both arrays, passing the same element count that was used in allocate().
cache_aligned_allocator<float>().deallocate(x, n+1);
cache_aligned_allocator<float>().deallocate(y, n+1);