/** skeletons.c
 *
 *  Generic functions for data-parallel skeletons:
 *  map, reduce, prefix
 *
 *  C. Kessler 990604
 */
#include <fork.h>   // always required in Fork programs
#include <io.h>     // printf() prototype 
#include <string.h> // memcpy() prototype


/** Generic data-parallel map for unary functions.
 *
 *  Calls f( x + i*elsize, d + i*elsize ) for the indices i below N,
 *  with the iterations distributed over the calling processors by the
 *  forall macro.  With a function such as fsqr in this file, which
 *  writes through its first argument, x receives the results and d
 *  supplies the operands.
 *
 *  f       function applied to each pair of elements
 *  x, d    shared arrays; element i is addressed as base + i*elsize
 *  N       number of elements
 *  elsize  size of one element, in the units of the pointer arithmetic
 *
 *  Side effect: $ is renumbered by the mpadd below.
 */
sync void map( sync sh void (*f)(void *, void *), sh void **x,
               sh void **d, sh int N, sh int elsize )
{
  sh int nprocs = 0;   // counts the processors executing this call
  int idx;             // private loop index

  // hand out consecutive numbers 0..nprocs-1 as $ while counting
  $ = mpadd(&nprocs,1);

  forall (idx, 0, N, nprocs ) {
     f( x + idx*elsize, d + idx*elsize );
  }
}


/** Generic f-reduction of the n-element array d into *s (older variant;
 *  not called by the code shown in this file).
 *
 *  Every processor first folds the elements assigned to it into its own
 *  slot of a temporary shared array; the slots are then combined by a
 *  stride-doubling loop, and slot 0 is copied to *s.
 *
 *  f       combining function, called as f( result, left, right )
 *  s       receives the reduction result; left untouched if n <= 0
 *  d       shared input array; element i is addressed as d + i*elsize
 *  n       number of elements in d
 *  elsize  size of one element, in the units of the pointer arithmetic
 *
 *  Side effect: $ is renumbered by the mpadd below.
 */
sync void oldreduce( sync sh void (*f)(void *, void *, void *),
                  sh void *s, sh void **d, sh int n, sh int elsize )
{
 sh int p = 0; 
 sh void **temp;
 sh int width;   // number of temp slots that hold a valid partial result
 int i;
 sh int t;
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1

 // Nothing to reduce: n is shared, so all processors leave together,
 // and no uninitialized temporary is copied to *s.
 if (n <= 0) return;

 // Only the first min(n,p) slots are ever initialized below; the
 // combining loop must not look at the remaining ones.
 if (n < p) width = n;
 else       width = p;
 
 temp = (void **)shalloc( p * elsize );  // temporary array
 farm // initialize temp by the first p elements of d:
   if ($ < n)
     memcpy( temp+$*elsize, d+$*elsize, elsize );
 
 // p partial reductions being computed (no iterations if n <= p):
 forall( i, p, n, p ) 
   f( temp + $*elsize, temp + $*elsize, d + i*elsize );

 // iterative computation upwards the tree, over the valid slots only:
 for (t=1; t<width; t = t<<1 )  // sequential loop over tree levels
   if ($+t < width) 
      f( temp+$*elsize, temp+$*elsize, temp + ($+t)*elsize );
 seq
   memcpy( s, temp, elsize );    // write result to *s
 shallfree();  // release temp
}


/** In-place reduction of a shared array with the combining function f.
 *
 *  The array d is taken to hold groupsize() items of elsize each, and
 *  the processors of the calling group are expected to be numbered
 *  consecutively by $, starting at 0.  On each level the processor
 *  numbered $ replaces item $ by f( item $, item $+stride ), with the
 *  stride doubling from level to level.  The caller in this file
 *  (reduce) reads the final result from the first item.
 *
 *  f       combining function, called as f( result, left, right )
 *  d       shared array, updated in place
 *  elsize  size of one item, in the units of the pointer arithmetic
 */
sync void preduce( sync sh void (*f)(void *, void *, void *),
                   sh void **d, sh int elsize )
{
 sh int nitems = groupsize();   // one item per processor of the group
 sh int stride;

 // processors numbered beyond the item count take no part
 if ( $ >= nitems ) {
   return;
 }

 // one pass per tree level, with the partner distance doubling
 for (stride=1; stride<nitems; stride <<= 1 ) {
   if ($+stride < nitems) {
      f( d+$*elsize, d+$*elsize, d + ($+stride)*elsize );
   }
 }
}


/** Sequential left-to-right reduction of an array by one processor.
 *
 *  Copies the first element of d into *s and then folds the remaining
 *  elements into it, one call f( s, s, element ) per element.
 *
 *  f       combining function, called as f( result, left, right );
 *          not called at all when n <= 1
 *  s       receives the result; left untouched when n <= 0
 *  d       input array; element k is addressed as d + k*elsize, i.e.
 *          by pointer arithmetic on the void ** type of d
 *  n       number of elements
 *  elsize  size of one element as passed to memcpy
 */
void seq_reduce( void (*f)(void *, void *, void *),
                      void *s, void **d, int n, int elsize )
{
 int k = 1;   // index of the next element to fold in

 if (n > 0) {
   memcpy( s, d, elsize );    // start from the first element

   while (k < n) {
     f( s, s, d + k*elsize );
     k++;
   }
 }
}


/** Generic parallel f-reduction of the n-element array d into *s.
 *
 *  If there are fewer processors than elements, d is cut into slices
 *  and every processor reduces its own slice sequentially with
 *  seq_reduce; otherwise each of the first n processors just copies one
 *  element.  The partial results are then combined by preduce, and the
 *  first one is copied to *s.
 *
 *  f       combining function, called as f( result, left, right )
 *  s       receives the reduction result; left untouched if n <= 0
 *  d       shared input array; element i is addressed as d + i*elsize
 *  n       number of elements in d
 *  elsize  size of one element, in the units of the pointer arithmetic
 *
 *  Side effect: $ is renumbered by the mpadd below.
 */
sync void reduce( sync void (*f)(void *, void *, void *),
                  sh void *s, sh void **d, sh int n, sh int elsize )
{
 sh int p = 0; 
 sh void **temp;
 int ss, myslice;
 int active;   // number of processors that hold a valid partial result
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1

 // Nothing to reduce: n is shared, so all processors leave together,
 // and no uninitialized temporary is copied to *s.
 if (n <= 0) return;
 
 temp = (void **)shalloc( p * elsize );  // temporary array

 if (p < n) {
   // partition the data vector d into p slices
   farm {
     ss = (n + p - 1) / p;   // slice size = ceil(n/p), in exact integer arithmetic
     pprintf("ss=%d\n", ss);
     if ($*ss >= n)          myslice = 0;
     else if (($+1)*ss > n)  myslice = n - $*ss;
     else                    myslice = ss;
     pprintf("myslice=%d\n", myslice);
   }
   // and concurrently do sequential reduction for each slice:
   farm
     seq_reduce( f, temp+$*elsize, d+$*ss*elsize, myslice, elsize );
   // Rounding the slice size up can leave the last processors with an
   // empty slice; seq_reduce writes nothing for those, so only the
   // first ceil(n/ss) slots of temp are valid.
   active = (n + ss - 1) / ss;
 }
 else {
   // copy data in parallel to temp array:
   farm
     if ($ < n)
       memcpy( temp+$*elsize, d+$*elsize, elsize );
   active = n;   // slots n..p-1 of temp stay uninitialized
 }

 // Combine the valid partial results only.  The processors passing this
 // test are numbered 0..active-1; this relies on groupsize() inside
 // preduce reporting the size of the subgroup entered here.
 if ($ < active)
   preduce( f, temp, elsize );

 seq
   memcpy( s, temp, elsize );    // write result to *s
 shallfree();  // release temp
}



/** In-place iterative parallel f-prefix computation
 *  for a p-element array with p processors, where p = groupsize().
 *
 *  On each level, the processor numbered $ (if $ >= t) replaces x[$] by
 *  f( x[$-t], x[$] ), with the distance t doubling from level to level.
 *
 *  If non-null, offset is f-combined into x[0] before the levels start;
 *  since x[0] is the first operand of every prefix, it thereby enters
 *  every element.
 *
 *  f       combining function, called as f( result, left, right )
 *  x       shared array, updated in place; element i is x + i*elsize
 *  offset  optional value to combine in front of the array, or NULL
 *  elsize  size of one element, in the units of the pointer arithmetic
 *
 *  Consecutive numbering of $ in 0..groupsize()-1 is assumed.
 */
sync void pprefix( sync sh void (*f)(void *, void *, void *),
                    sh void **x, sh void *offset, sh int elsize )
{
 sh int p = groupsize(); 
 sh int t;
  
 if (offset)          // offset is shared: same outcome on all processors
   if ($==0)          // only the processor numbered 0 updates x[0]
     f( x, offset, x );

 // iterative computation upwards the tree:
 for (t=1; t<p; t = t<<1 )  // sequential loop over tree levels
   if ($ >= t)        // the first t elements are already final
      f( x+$*elsize, x+($-t)*elsize, x+$*elsize );

}

/** Generic parallel f-prefix of the n-element array d, written to x.
 *
 *  d is first copied to x; x is then processed in consecutive slices of
 *  p elements.  Each slice is handled by pprefix, which receives the
 *  last element of the preceding slice as offset (NULL for the first
 *  slice).
 *
 *  f       combining function, called as f( result, left, right )
 *  x       shared result array; element i is addressed as x + i*elsize
 *  d       shared input array, addressed the same way
 *  n       number of elements
 *  elsize  size of one element, in the units of the pointer arithmetic
 *
 *  Side effect: $ is renumbered by the mpadd below.
 */
sync void prefix( sync sh void (*f)(void *, void *, void *),
                  sh void **x, sh void **d,
                  sh int n, sh int elsize )
{
 int i;
 sh int t;
 sh void *offset = NULL;   // no offset for the first slice
 sh int iter, p = 0; 
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1
  
 if ($ >= n) return;   // need at most n processors
 // NOTE(review): p still counts the processors that returned above, so
 // for p > n the slice loop below runs exactly once -- confirm intended.

 // NOTE(review): iter is handed to FORALL without being initialized
 // here; confirm that the macro sets it up itself.
 farm
   FORALL( i, &iter, 0, n, 1 )
     memcpy( x+i*elsize, d+i*elsize, elsize );    // x[i]=d[i]

 // outer sequential loop over x in slices of size p:
 for (t=0; t<n; t += p ) {
   if ($ < n-t)    // do not exceed bound of x for last slice
     pprefix( f, x+t*elsize, offset, elsize );
   offset = x + (t+p-1)*elsize; // offset for next slice prefix
 }
}


// Shared lock; main acquires it around its printAccStat() call and
// releases it afterwards.
sh simple_lock screen = 0;

// example application for floats:

// Float addition in the three-pointer form expected by reduce and
// prefix: treats a, b and c as float pointers and stores *a + *b in *c.
sync void fadd( void *c, void *a, void *b ) 
{
  float lhs = *(float *)a;
  float rhs = *(float *)b;

  *(float *)c = lhs + rhs;
}

// Float square in the two-pointer form expected by map: treats a and c
// as float pointers and stores (*a) * (*a) in *c.
sync void fsqr( void *c, void *a ) 
{
  float v = *(float *)a;

  *(float *)c = v * v;
}

// Two-int struct with the instances st1 (initialized to {1,2}) and st2
// (no explicit initializer); none of them is referenced by the code
// shown in this file.
struct st { int i; int j; } st1 = {1,2}, st2;

/** Example driver for the skeletons, using floats.
 *
 *  Fills x with 0..n-1, prints it, then prints the fadd-prefix of x,
 *  the element-wise squares of x (via map with fsqr), and finally the
 *  fadd-reduction of those squares.  Afterwards printAccStat() is
 *  called under the screen lock.
 */
int main(void)
{
  start {
    sh float s, *x, *tmp;
    sh int n = 12;
    int i;
   
    x = (float *)shalloc( n * sizeof(float) );
    tmp = (float *)shalloc( n * sizeof(float) );
    farm
       forall( i, 0, n, __STARTED_PROCS__) 
          x[i] = (float) i;     // some initialization
    seq for(i=0; i<n; i++) printf("x[%d] = %f\n", i, x[i] );

    // tmp = prefix sums of x
    prefix( fadd, (void **) tmp, (void **)x, n, sizeof(float));
    seq for(i=0; i<n; i++) printf("prefix[%d] = %f\n", i, tmp[i] );

    // tmp[i] = x[i] * x[i]  (overwrites the prefix sums)
    map( fsqr, (void **) tmp, (void **)x, n, sizeof(float) );
    seq for(i=0; i<n; i++) printf("tmp[%d] = %f\n", i, tmp[i] );

    // s = sum of the squares
    reduce( fadd, &s, (void **) tmp, n, sizeof(float) );
    seq printf(" %f\n", s );
  }
  simple_lockup(&screen);
    printAccStat();
  simple_unlock(&screen);
  barrier;
  return 0;   // main is declared int: return a defined status
}

