/** skeletons.c
 *
 *  generic functions for data-parallel skeletons:
 *  map, reduce, prefix
 *
 *  C. Kessler 990604
 */
#include <fork.h>   // always required in Fork programs
#include <io.h>     // printf() prototype 
#include <string.h> // memcpy() prototype
#include <assert.h>


/* included: Matrix.c   by C.W.Kessler 07/99 */

/* Descriptor of a float matrix.
 * `matrix` names the record, `Matrix` a pointer to it.
 * Element (i,j) is reached as row[i][j] (this is what the Get/Set macros
 * expand to), so a descriptor may also describe a view whose row pointers
 * point into another matrix's storage.
 * The sparse-only fields exist only when SPARSE is defined.
 */
typedef struct { 
  int nrows;       // number of rows
  int ncols;       // number of columns (or max. #cols if sparse)
  float **row;     // array of pointers to first element of each row
  float *data;     // points to linearized array holding all elements
#ifdef SPARSE
  int *n;          // array of sizes of #nonzeros per row (if sparse) 
  int nz;          // number of stored nonzero elements (sparse)
  int *col;        // array of column indices for the nonzeros (sparse)
#endif
} matrix, *Matrix;

// constructors:
// new_Matrix allocates a dense nr x nc matrix;
// splitMatrix builds four quarter-submatrix views of M, split at row x
// and column y, without copying the element buffer.
extern sync Matrix new_Matrix( sh int nr, sh int nc );
//extern sync Matrix copyMatrix( sh Matrix M );
extern sync Matrix *splitMatrix( sh Matrix M, int x, int y );
// destructor:
// releases the element buffer, the row-pointer table and the descriptor.
extern void freeMatrix( Matrix M );
// output:
// prints a size header; the elements are printed only for at most 8 rows.
extern void printMatrix( Matrix M );
// accessing single matrix elements:
// Macro for void Set( Matrix M, int i, int j, float value ):
// stores value in element (i,j) of M.
// The whole expansion is parenthesized so that the assignment cannot
// capture operators written next to the macro call (e.g. Set(...) * 2
// must not store value*2); the expression yields the stored value.
#define Set( M, i, j, value ) \
           ((M)->row[i][j] = (value))
// Macro for float Get( Matrix M, int i, int j ):
// reads element (i,j) of M through its row-pointer table.
#define Get( M, i, j ) \
           ((M)->row[i][j])
// arithmetics:
// In each routine C receives the result of combining A and B.
// StrassenMul is only declared here; its definition is not part of the
// visible portion of this file.
extern sync void Add( sh Matrix A, sh Matrix B, sh Matrix C );
extern sync void Sub( sh Matrix A, sh Matrix B, sh Matrix C );
extern sync void Mul( sh Matrix A, sh Matrix B, sh Matrix C );  // standard method
extern sync void StrassenMul( sh Matrix A, sh Matrix B, sh Matrix C );

// All sync routines in this program expect that
// the group's processors are consecutively numbered 0,1,...,p-1.


/** Allocate a dense nr x nc float matrix in shared memory.
 *
 *  nr, nc: number of rows / columns; both are asserted to be positive.
 *  Returns the shared descriptor; its row table points into one
 *  contiguous buffer of nr*nc floats (row i starts at buf + i*nc).
 *  The element values themselves are not initialized here.
 *  NOTE(review): the results of shmalloc are used without a check.
 */
sync Matrix new_Matrix( sh int nr, sh int nc )
{
 sh Matrix M;
 sh int p = groupsize();
 sh float *buf;
 int i;
 
 // allocation is done once (seq section), not once per processor
 seq {
   assert( nr > 0 );
   assert( nc > 0 );
   M = (Matrix)shmalloc( sizeof(matrix) );
   M->row = (float **)shmalloc( nr * sizeof(float *) );
   M->data = buf = (float *)shmalloc( nr * nc * sizeof(float) );
 }
 // every processor stores the same dimensions into the shared descriptor
 M->nrows = nr;
 M->ncols = nc;
 // the row-pointer table is filled in parallel, rows dealt out with stride p
 farm
   forall( i, 0, nr, p )
     M->row[i] = buf + i * nc;  
 return M;
} 


/** Print matrix M with printf.
 *
 *  A "Matrix <rows>X<cols>:" header line is always written.  The
 *  elements follow, one text line per matrix row and terminated by an
 *  empty line, only if the matrix has at most 8 rows.
 */
void printMatrix( Matrix M )
{
 int nrows = M->nrows;
 int ncols = M->ncols;
 int r, c;

 printf("Matrix %dX%d:\n", nrows, ncols );

 // larger matrices are summarized by the header alone
 if (m_rows_printable:
     nrows <= 8) {
    for (r = 0; r < nrows; r++) {
       for (c = 0; c < ncols; c++)
          printf(" %2.2f", Get( M, r, c ));
       printf("\n");
    }
    printf("\n");
 }
}


/** Release matrix M: first its element buffer, then its row-pointer
 *  table, finally the descriptor itself.  M must be non-null (asserted).
 *  NOTE(review): the views returned by splitMatrix never set `data`, so
 *  they do not look safe to pass to this routine -- confirm with callers.
 */
void freeMatrix( Matrix M )
{
 float *elements;
 float **rowtable;

 assert( M );

 // fetch both members before the descriptor is given back
 elements = M->data;
 rowtable = M->row;

 shfree( elements );
 shfree( rowtable );
 shfree( M );
}


/** Create four quarter-submatrix views of M, split at row x and column y.
 *
 *  Returns a shared array of 4 descriptors:
 *    [0] top-left      x     rows,  y     columns
 *    [1] top-right     x     rows,  n-y   columns
 *    [2] bottom-left   m-x   rows,  y     columns
 *    [3] bottom-right  m-x   rows,  n-y   columns
 *  where m x n is the size of M.  Asserted: M non-null, 0 < x < m,
 *  0 < y < n.  Each view gets a freshly allocated row-pointer table whose
 *  entries point into the rows of M; no element is copied.
 *
 *  NOTE(review): the `data` member of the four views is never assigned
 *  (the assignments are commented out), so it must not be read or freed.
 *  NOTE(review): x and y are not declared `sh`; the seq section and the
 *  farm loops use whatever value the executing processor holds --
 *  presumably all callers pass identical values; confirm.
 */
sync Matrix *splitMatrix( sh Matrix M, int x, int y ) 
{
 // create 4 quarter-submatrix views of M (no copy of data buffer)
 sh Matrix *sM;
 sh int p = groupsize();
 sh float *buf;
 sh int m, n;
 int i;
 
 // argument checks and allocation of the result array, done once
 seq {
   assert( M );
   assert( x > 0 );
   assert( y > 0 );
   m = M->nrows;
   n = M->ncols;
   assert( x < m );
   assert( y < n );
   //pprintf("splitMatrix( %dx%d -> %dx%d etc.\n", m, n, x, y );
   sM = (Matrix *)shmalloc( 4 * sizeof(Matrix) );
 }
 // the four descriptors are allocated in parallel
 farm
   forall( i, 0, 4, p )
     sM[i] = (Matrix)shmalloc( sizeof(matrix) );
 // dimensions and row tables of the four quadrants
 seq {
   sM[0]->nrows = x;
   sM[0]->ncols = y;
   sM[0]->row = (float **)shmalloc( x * sizeof(float *) );
   // sM[0]->data = M->data;
   sM[1]->nrows = x;
   sM[1]->ncols = n-y;
   sM[1]->row = (float **)shmalloc( x * sizeof(float *) );
   // sM[1]->data = M->data;
   sM[2]->nrows = m-x;
   sM[2]->ncols = y;
   sM[2]->row = (float **)shmalloc( (m-x) * sizeof(float *) );
   // sM[2]->data = M->data;
   sM[3]->nrows = m-x;
   sM[3]->ncols = n-y;
   sM[3]->row = (float **)shmalloc( (m-x) * sizeof(float *) );
   // sM[3]->data = M->data;
 }
 farm {
   // upper quadrants: rows 0..x-1 of M, right quadrant shifted by y columns
   forall( i, 0, x, p ) {
      sM[0]->row[i] = M->row[i];
      sM[1]->row[i] = M->row[i] + y;
   }
   // lower quadrants: rows x..m-1 of M, right quadrant shifted by y columns
   forall( i, 0, m-x, p ) {
      sM[2]->row[i] = M->row[i+x];
      sM[3]->row[i] = M->row[i+x] + y;
   }
 }
 return sM;
}



/** Element-wise matrix addition: C(i,j) = A(i,j) + B(i,j).
 *
 *  A, B and C must be non-null and have identical dimensions (asserted).
 *  The group's p = groupsize() processors share the index space; the
 *  Forall2 macro comes from the Fork headers -- presumably it deals the
 *  n*m pairs (i,j) out over p processors using ij as linear counter.
 */
sync void Add( sh Matrix A, sh Matrix B, sh Matrix C )
{
 sh int p = groupsize();
 sh int n, m;
 int ij, i, j;

 farm {
   assert(A);
   assert(B);
   assert(C);
   // every processor writes the same values into the shared n and m
   n = A->nrows;
   m = A->ncols;
   assert( n == B->nrows );
   assert( n == C->nrows );
   assert( m == B->ncols );
   assert( m == C->ncols );
   Forall2( i, j, ij, 0, n, 0, m, p ) {
     //i = ij / n;
     //j = ij % n;
     Set( C, i,j, Get(A,i,j) + Get(B,i,j) );
   }
 }
}


/** Element-wise matrix subtraction: C(i,j) = A(i,j) - B(i,j).
 *
 *  A, B and C must be non-null and have identical dimensions.  The same
 *  checks as in Add are performed, and they are performed before the
 *  first dereference: previously A was dereferenced in the declarations
 *  and the size of C was never verified, so an undersized C was written
 *  out of bounds.
 *  The group's p = groupsize() processors share the index space.
 */
sync void Sub( sh Matrix A, sh Matrix B, sh Matrix C )
{
 sh int p = groupsize();
 sh int n, m;
 int ij, i, j;

 farm {
   assert(A);
   assert(B);
   assert(C);
   // every processor writes the same values into the shared n and m
   n = A->nrows;
   m = A->ncols;
   assert( n == B->nrows );
   assert( n == C->nrows );
   assert( m == B->ncols );
   assert( m == C->ncols );
   Forall2( i, j, ij, 0, n, 0, m, p ) {
     Set( C, i,j, Get(A,i,j) - Get(B,i,j) );
   }
 }
}


/** Standard matrix multiplication: C = A * B.
 *
 *  A is n x r, B is r x m and C is n x m; the matching of these
 *  dimensions and non-null arguments are asserted.  Each element C(i,j)
 *  is computed by one processor as the sum over k of A(i,k)*B(k,j),
 *  accumulated in the private variable s.
 */
sync void Mul( sh Matrix A, sh Matrix B, sh Matrix C )
{  // standard method
 sh int p = groupsize();
 sh int n, m, r;
 int ij, i, j, k;
 float s;
 // argument checks and shared dimensions are set up once
 seq {
   assert(A);
   assert(B);
   assert(C);
   n = A->nrows;
   m = B->ncols;
   r = A->ncols;
   assert( r == B->nrows );
   assert( n == C->nrows );
   assert( m == C->ncols );
 }
 farm {
   // Forall2 comes from the Fork headers; presumably it deals the n*m
   // pairs (i,j) out over the p processors
   Forall2( i, j, ij, 0, n, 0, m, p ) {
   //forall( ij, 0, n*m, p ) {
     //i = ij / m;
     //j = ij % m;
     s = 0.0;
     for (k=0; k<r; k++)
        s += Get(A,i,k) * Get(B,k,j);
     Set( C, i,j, s );
   }
 }
}


// ------------------------------------------------------

// Generic non-nestable map skeleton for unary functions.
// For every index i in 0..N-1 the call f( x + i*elsize, d + i*elsize )
// is issued, i.e. f is handed the addresses of the i-th result item and
// the i-th operand item; the indices are dealt out with stride nprocs.

sync void map( sync sh void (*f)(void *, void *), sh void **x,
               sh void **d, sh int N, sh int elsize )
{
  sh int nprocs = 0;   // counts the processors executing this call
  int idx;

  // renumber: each processor draws its $ from the shared counter
  $ = mpadd(&nprocs,1); // assert consecutive numbering of $ in 0..p-1

  forall (idx, 0, N, nprocs ) {
     f( x + idx*elsize, d + idx*elsize );
  }
}


// Generic non-nestable map skeleton for binary functions whose second
// operand is a scalar: for every index i in 0..N-1 the call
// f( x + i*elsize, d + i*elsize, d2 ) is issued, so d2 is passed
// unchanged to every call.  Indices are dealt out with stride nprocs.

sync void map1( sync sh void (*f)(void *, void *, void *),
                sh void **x, sh void **d, sh void **d2,
                sh int N, sh int elsize )
{
  sh int nprocs = 0;   // counts the processors executing this call
  int idx;

  // renumber: each processor draws its $ from the shared counter
  $ = mpadd(&nprocs,1); // assert consecutive numbering of $ in 0..p-1

  forall (idx, 0, N, nprocs ) {
     f( x + idx*elsize, d + idx*elsize, d2 );
  }
}


// Generic non-nestable map skeleton for binary functions.
// For every index i in 0..N-1 the call
// f( x + i*elsize, d + i*elsize, d2 + i*elsize ) is issued, i.e. f gets
// the addresses of the i-th result item and of the i-th item of both
// operand arrays.  Indices are dealt out with stride nprocs.

sync void map2( sync sh void (*f)(void *, void *, void *), 
                sh void **x, sh void **d, sh void **d2,
                sh int N, sh int elsize )
{
  sh int nprocs = 0;   // counts the processors executing this call
  int idx;

  // renumber: each processor draws its $ from the shared counter
  $ = mpadd(&nprocs,1); // assert consecutive numbering of $ in 0..p-1

  forall (idx, 0, N, nprocs ) {
     f( x + idx*elsize, d + idx*elsize, d2 + idx*elsize );
  }
}


// Larger of a and b (a is chosen when both compare equal).
// The selected argument is expanded twice, so avoid arguments with
// side effects.
#define max(a,b) ((a)>=(b)?(a):(b))

// Generic nestable map skeleton for unary functions.
//
// f      synchronous function called as f( result, operand, m )
// x, d   result array and operand array
// N      number of outer elements to map over
// mx, md inner extents of x and d; m = max(mx,md) is passed on to f
// elsize size of one item
//
// With fewer processors than elements (p < N) the elements are handled
// in rounds of p, each processor forming a subgroup of its own; in the
// last round processors without an element stay idle.  Otherwise the
// group is split into N subgroups (processor $ joins subgroup $%N) and
// subgroup @ works on element @.
//
// The branches are braced explicitly: without braces the `else` attached
// to the inner `if (t+@ < N)`, so nothing at all was executed for p >= N
// and idle processors of the last round ran the N-way split instead.
//
// Both x and d are advanced by m*elsize per element.
// NOTE(review): Map2 strides every operand by its own extent instead --
// confirm that the common stride is intended when mx != md.
// N is expected to be positive: the N-way split evaluates $%N and $/N.

sync void Map( sync sh void (*f)(sh void *, sh void *, sh int), 
               sh void **x, sh void **d, 
               sh int N, sh int mx, sh int md, sh int elsize )
{
  sh int m, t, p = 0;
  farm m = max(mx,md);   // extent of inner calculation
  pprintf("call Map(%d,%d,%d)\n", N, m, elsize);

  $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1

  if (p < N) {
    for ( t=0; t<N; t+=p ) {
      fork(p; @=$; $=0) {
        if (t+@ < N)
          f( x + (t+@)*m*elsize, d + (t+@)*m*elsize, m );
      }
    }
  }
  else {
    fork( N; @=$%N; $=$/N )
      f( x + @*m*elsize, d + @*m*elsize, m );
  }
}





// nestable map skeleton for binary functions
// Set md2 to zero if d2 denotes a scalar value.
//
// f           synchronous function called as f( result, opnd1, opnd2, m )
// x, d, d2    result array and the two operand arrays
// N           number of outer elements to map over
// mx, md, md2 inner extents of x, d and d2; element t of an array starts
//             at offset t*extent*elsize, so an extent of 0 makes every
//             call see the same address
// elsize      size of one item
// m = max(mx,md,md2) is passed to f as the extent of the inner calculation.
//
// With fewer processors than elements (p < N) every processor forms its
// own subgroup and handles the elements @, @+p, @+2p, ...; otherwise the
// group is split into N subgroups (processor $ joins subgroup $%N) and
// subgroup @ handles element @.
// The pprintf calls are debugging traces.

sync void Map2( sync sh void (*f)(sh void *, sh void *, sh void *, sh int),
                sh void **x, sh void **d, sh void **d2, 
                sh int N, sh int mx, sh int md, sh int md2, sh int elsize )
{
  sh int m;
  sh int p = 0;
  $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1
  farm m = max(mx,max(md,md2));   // extent of inner calculation
  pprintf("call Map2(N=%d,m=%d,elsize=%d) $=%d\n", N, m, elsize, $);
  if (p < N)
    fork(p; @=$; $=0) {
      // t is shared within the subgroup created by fork
      sh int t = @;
      farm pprintf("vor for: P%d: &t=%x t=%d @=%d $=%d\n",
            __PROC_NR__, &t, t, @, $ );
      // this subgroup processes every p-th element, starting at @
      for (t=@; t<N; t+=p) {
        farm pprintf("P%d: t=%d N=%d @=%d $=%d\n",
                      __PROC_NR__, t, N, @, $ );
        f( x + t*mx*elsize, d + t*md*elsize, d2 + t*md2*elsize, m);
      }
      farm pprintf("nach for: P%d: &t=%x t=%d N=%d @=%d $=%d gs=%d\n",
            __PROC_NR__, &t, t, N, @, $, async_groupsize() );
    }
  else 
    fork( N; @=$%N; $=$/N )
      f( x + @*mx*elsize, d + @*md*elsize, d2 + @*md2*elsize, m);
}




/** Older variant of the generic reduction (see reduce below in this file).
 *
 *  f      synchronous combine function, called as f( result, a, b )
 *  s      receives the reduction result (elsize bytes are copied)
 *  d      array of n items of size elsize
 *
 *  Steps: temp[$] is seeded with d[$]; every processor folds the items
 *  d[i], i = p+$, 2p+$, ... into its temp slot; the p partial results are
 *  then combined in ceil(log2 p) rounds, after which temp[0] is copied
 *  to *s.
 *
 *  NOTE(review): temp[$] is only seeded when $ < n, but the combining
 *  rounds range over all p slots, so with n < p unseeded slots appear to
 *  be combined into the result -- confirm before reusing this routine.
 *  NOTE(review): the combining rounds let every processor read slot $+t
 *  while that slot's owner updates it; this relies on the synchronous
 *  execution of f -- confirm.
 */
sync void oldreduce( sync sh void (*f)(void *, void *, void *),
                  sh void *s, sh void **d, sh int n, sh int elsize )
{
 sh int p = 0; 
 sh void **temp;
 int i;
 sh int t;
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1
 
 temp = (void **)shalloc( p * elsize );  // temporary array
 farm // initialize temp by the first p elements of d:
   if ($ < n)
     memcpy( temp+$*elsize, d+$*elsize, elsize );
 
 // p partial reductions being computed:
 forall( i, p, n, p ) 
   f( temp + $*elsize, temp + $*elsize, d + i*elsize );

 // iterative computation upwards the tree:
 for (t=1; t<p; t = t<<1 )  // sequential loop over tree levels
   if ($+t < p) 
      f( temp+$*elsize, temp+$*elsize, temp + ($+t)*elsize );
 seq
   memcpy( s, temp, elsize );    // write result to *s
 shallfree();  // release temp
}


/** In-place f-reduction for an array d of n<=2p items of size elsize.
 *  Executed by p processors numbered consecutively from 0 to p-1.
 *
 *  f is called as f( result, a, b ).  In the round with distance t
 *  (t = 1, 2, 4, ...) processor $ combines item 2*$*t with item 2*$*t+t
 *  and stores the result in item 2*$*t, provided the second index is
 *  below n.  After the last round the overall result is in d[0]; the
 *  other items of d have been overwritten by partial results.
 */
sync void preduce( sync sh void (*f)(void *, void *, void *),
                   sh void **d, sh int n, sh int elsize )
{
 sh int t;
 //seq if (n <= 2*async_groupsize())
 //   printf("Error: n=%d p=%d\n", n, async_groupsize());
 // iterative computation upwards the tree:
 for (t=1; t<n; t = t<<1 )   // sequential loop over tree levels
   if (2*$*t+t < n) 
      f( d+(2*$*t)*elsize, d+(2*$*t)*elsize, d + (2*$*t+t)*elsize );
}


/** Sequential f-reduction of the n items of size elsize stored in d.
 *
 *  f is called as f( result, a, b ).  *s is first set to a copy of item 0
 *  and then combined, in index order, with the items 1..n-1 via
 *  f( s, s, d + k*elsize ).  For n <= 0 nothing happens: *s is left
 *  untouched and f is never called.
 */
void seq_reduce( void (*f)(void *, void *, void *),
                 void *s, void **d, int n, int elsize )
{
 int k = 1;

 if (n <= 0) return;

 memcpy( s, d, elsize );    // s starts out as d[0]
 while (k < n) {
    f( s, s, d + k*elsize );
    k++;
 }
}


/** Generic f-reduction of the n items of size elsize in d; the result is
 *  copied to *s.
 *
 *  f      combine function, called as f( result, a, b )
 *  s      receives the result (elsize bytes)
 *  d      array of n items
 *
 *  With p processors ($ renumbered to 0..p-1) and more than 2p items, d
 *  is cut into slices of ss = ceil(n/p) items; every processor reduces
 *  its slice sequentially into temp[$], and preduce then combines the
 *  partial results.  With at most 2p items they are copied to temp and
 *  handed to preduce directly.
 *
 *  Only the non-empty slices are handed to preduce.  When p does not
 *  suit n (e.g. n=9, p=4 gives ss=3 and only 3 slices) the last
 *  processors own an empty slice and never write their temp slot;
 *  previously preduce still ran over all p slots and folded that
 *  unwritten memory into the result.
 *  The slice size is computed in integer arithmetic; the former detour
 *  through float loses precision for large n.
 *
 *  NOTE(review): for n <= 0 no item is ever written to temp, so *s
 *  receives unspecified data -- callers should pass n >= 1.
 */
sync void reduce( sync void (*f)(void *, void *, void *),
                  sh void *s, sh void **d, sh int n, sh int elsize )
{
 sh int p = 0; 
 sh void **temp;
 int ss, myslice, nslices;
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1
 pprintf("call reduce(%d)\n", n );
 
 temp = (void **)shalloc( 2*p*elsize );  // temporary array

 if (2*p < n) {
   // partition the data vector d into slices of ss items
   farm {
     ss = (n + p - 1) / p;              // ceil(n/p); >= 3 in this branch
     nslices = (n + ss - 1) / ss;       // slices holding at least 1 item
     pprintf("ss=%d\n", ss);
     if ($*ss >= n)          myslice = 0;
     else if (($+1)*ss > n)  myslice = n - $*ss;
     else                    myslice = ss;
     pprintf("myslice=%d\n", myslice);
   }
   // and concurrently do sequential reduction for each slice:
   farm
     seq_reduce( f, temp+$*elsize, d+$*ss*elsize, myslice, elsize );
   // extent of preduce: exactly the temp slots that were written
   // (every processor stores the same value)
   n = nslices;
 }
 else 
   // copy <=2p data items in parallel to temp array:
   farm {
     if ($<n)
       memcpy( temp+$*elsize, d+$*elsize, elsize );
     if (p+$ < n)
       memcpy( temp+(p+$)*elsize, d+(p+$)*elsize, elsize );
   }

 preduce( f, temp, n, elsize );

 seq
   memcpy( s, temp, elsize );    // write result to *s
 shallfree();  // release temp
}



/** In-place iterative parallel f-prefix computation
 *  for a p-element array with p processors.
 *  If non-null, offset is f-combined with every element.
 *  Consecutive numbering of $ in 0..groupsize()-1 is assumed.
 *
 *  f is called as f( result, a, b ).  The offset is combined into x[0]
 *  only; it reaches the other elements through the rounds that follow.
 *  In the round with distance t (t = 1, 2, 4, ...) every processor with
 *  $ >= t replaces x[$] by f( x[$-t], x[$] ).
 *  NOTE(review): in one round x[$-t] is read while its owner updates it;
 *  this relies on the synchronous execution of f -- confirm.
 */
sync void pprefix( sync sh void (*f)(void *, void *, void *),
                    sh void **x, sh void *offset, sh int elsize )
{
 sh int p = groupsize(); 
 sh int t;
  
 // fold the offset (result of the preceding slice) into the first element
 if (offset)
   if ($==0)
     f( x, offset, x );

 // iterative computation upwards the tree:
 for (t=1; t<p; t = t<<1 )  // sequential loop over tree levels
   if ($ >= t) 
      f( x+$*elsize, x+($-t)*elsize, x+$*elsize );

}

/** Generic f-prefix computation: x receives the prefix "sums" of the n
 *  items of size elsize in d, f being called as f( result, a, b ).
 *
 *  d is first copied to x; x is then processed in consecutive slices of
 *  p items by pprefix, the last item of a finished slice serving as the
 *  offset of the next one.
 *
 *  NOTE(review): p counts all processors that entered, including those
 *  leaving again through the early return, while pprefix works with its
 *  own groupsize() -- confirm the two agree when p > n.
 *  NOTE(review): FORALL comes from the Fork headers; presumably it hands
 *  out the indices 0..n-1 dynamically through the shared counter iter.
 */
sync void prefix( sync sh void (*f)(void *, void *, void *),
                  sh void **x, sh void **d,
                  sh int n, sh int elsize )
{
 int i;
 sh int t;
 sh void *offset = NULL;
 sh int iter, p = 0; 
 $ = mpadd(&p,1); // assert consecutive numbering of $ in 0..p-1
  
 if ($ >= n) return;   // need at most n processors

 farm
   FORALL( i, &iter, 0, n, 1 )
     memcpy( x+i*elsize, d+i*elsize, elsize );    // x[i]=d[i]

 // outer sequential loop over x in slices of size p:
 for (t=0; t<n; t += p ) {
   if ($ < n-t)    // do not exceed bound of x for last slice
     pprefix( f, x+t*elsize, offset, elsize );
   // after the last slice this address lies behind x; it is not used then
   offset = x + (t+p-1)*elsize; // offset for next slice prefix
 }
}


// Example application for floats.

// fadd: stores the float sum *a + *b in *c.  Both operands are read
// before the result is written, so c may alias a or b.
sync void fadd( void *c, void *a, void *b ) 
{
  float lhs = *(float *)a;
  float rhs = *(float *)b;

  *(float *)c = lhs + rhs;
}

// fsqr: stores the square of the float *a in *c (c may alias a).
sync void fsqr( void *c, void *a ) 
{
  float value = *(float *)a;

  *(float *)c = value * value;
}

// Pair-of-int record with two file-scope instances: st1 initialized to
// {1,2}, st2 without initializer.  Neither is referenced in the visible
// part of this file.
struct st { int i; int j; } st1 = {1,2}, st2;


// fmul: stores the float product *a * *b in *c.  Both operands are read
// before the result is written, so c may alias a or b.
void fmul( void *c, void *a, void *b ) 
{
  float lhs = *(float *)a;
  float rhs = *(float *)b;

  *(float *)c = lhs * rhs;
}

/** Dot product of the float vectors x and y of length n; result in *s.
 *
 *  Built from the skeletons of this file: map2 with fmul fills a
 *  temporary vector tmp[i] = x[i]*y[i], reduce with fadd sums tmp into
 *  *s.  tmp is allocated with shalloc and given back by shallfree.
 *  The pprintf call is a debugging trace.
 *  NOTE(review): fmul is defined without `sync` although map2 declares
 *  its function parameter as `sync sh` -- confirm this is accepted.
 *  NOTE(review): i is only used by the commented-out sequential loop.
 */
sync void fdot( sh float *s, sh float *x, sh float *y, sh int n )
{
 sh float *tmp = (float *)shalloc( n * sizeof(float) );
 int i;
 pprintf("fdot(%d): %x=(%x,%x)\n", n, s, x, y );
 
 map2( fmul, (void **) tmp, (void **)x, (void **)y, n, sizeof(float) );
 //seq for(i=0;i<n;i++) *s += tmp[i]; // ((float *)x)[i] * ((float *)y)[i];
 reduce( fadd, s, (void **) tmp, n, sizeof(float) );
 shallfree();
}

/** Test driver: matrix-vector product built from nested skeletons.
 *
 *  Sets up the vector x (m floats, x[i] = i) and the n x m matrix A with
 *  A(i,j) = i+j, prints both, and then computes under tracing
 *  c[i] = <row i of A, x> by mapping fdot over the rows: Map2 strides
 *  c by 1, A->data by m and x by 0 (the same vector for every row).
 *  Finally c and s are printed and the trace file is written.
 *
 *  s is not written by the current test; it is initialized so that the
 *  final printf no longer reads an uninitialized variable.
 */
int main(void)
{
  start {
    sh float s = 0.0, *x, *c;
    sh int n = 5, m = 7;
    sh Matrix A;
    int i, j;
   
    x = (float *)shalloc( m * sizeof(float) );
    c = (float *)shalloc( n * sizeof(float) );
    farm
       forall( i, 0, m, __STARTED_PROCS__) 
          x[i] = (float) i;     // some initialization
    seq for(i=0; i<m; i++) printf("x[%d] = %f\n", i, x[i] );
    A = new_Matrix( n, m );
    seq
       for (i=0; i<n; i++)
          for (j=0; j<m; j++)
             Set( A, i, j, (float)(i+j) );
    seq printMatrix( A );
    initTracing( 100000 );
    startTracing();
    // one fdot call per matrix row; the rows lie back to back in A->data
    Map2( (void(*)(void*,void*,void*,int))fdot, 
          (void **)c, (void **)(A->data), (void *)x,
          n, 1, m, 0, sizeof(float) );
    stopTracing();
    //prefix( fadd, (void **) c, (void **)x, n, sizeof(float));
    //seq for(i=0; i<n; i++) printf("prefix[%d] = %f\n", i, c[i] );
    //map( fsqr, (void **) c, (void **)x, n, sizeof(float) );
    seq for(i=0; i<n; i++) printf("c[%d] = %f\n", i, c[i] );
    seq printf(" %f\n", s );
    writeTraceFile("Map2", "nested_parallelism_Map2(map;reduce)");
  }
  barrier;
}

