// Parallel divide-and-conquer skeleton,
// applied to a recursive parallel floating-point sum computation.

#include <fork.h>
#include <io.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>


/** Generic parallel divide-and-conquer skeleton (synchronous function).
 *
 *  Executed by all processors of the calling group. Control flow:
 *    1. istrivial(n) != 0:  solvetrivial() is run inside a seq statement.
 *    2. the group has exactly one processor:  solveseq() is run.
 *    3. otherwise divide() produces k subproblems; if k exceeds the group
 *       size, solveseq() is run inside a seq statement instead.
 *    4. otherwise the group is split into k subgroups with fork, each
 *       subgroup recurses on one subproblem, and conquer() combines the
 *       k subsolutions into the result.
 *
 *  presult       address of the result slot; passed unchanged to the
 *                solver / conquer callbacks, which define what is stored
 *  data          operand array
 *  n             number of elements in the operand array
 *  elsize        element size, passed through to the callbacks
 *  solveseq      sequential solver
 *  istrivial     predicate: nonzero if a problem of size nn is trivial
 *  solvetrivial  solver for trivial problems
 *  divide        sets *pk, *psubprob and *psubsiz to the number of
 *                subproblems, their operand arrays and their sizes
 *  conquer       combines kk subsolutions into *pres
 *
 *  NOTE(review): the three early returns leave without calling
 *  shallfree(); only the full fork/conquer path calls it. Whether that
 *  matters depends on the lifetime rules of shalloc() - confirm.
 */
sync void divide_conquer(
  sh void ***presult, // address of result array
  sh void **data,   // operand array
  sh int n,         // #elements in operand array
  sh int elsize,    // element size in operand array
  sh void (*solveseq)( void ***pres, void **dat, int nn, int elsiz ),
  sh sync int (*istrivial)( sh int nn ),
  sh void (*solvetrivial)( void ***pres, void **dat, int nn, int elsiz ),
  sh sync void (*divide)( sh int *pk, sh void ***psubprob, sh int **psubsiz, 
                          sh void **dat, sh int nn, sh int elsiz ),
  sh sync void (*conquer)( sh int kk, sh void ***pres, sh void ***subsol,
                           sh int *ss, sh int nn, sh int elsiz )
 )
{
  sh int p = groupsize();   // number of processors in the current group
  sh int k;                 // number of subproblems, set by divide()
  sh int istr;
  sh int *subsize;          // subsize[j]: size of subproblem j
  sh void **subproblem, ***subsolution;

  // trivial problem: solved once, inside a seq statement
  istr = istrivial( n );
  if (istr) {
     farm pprintf("solvetrivial %d\n", n);
     seq solvetrivial( presult, data, n, elsize );
     return;
  }

  // a single processor cannot split any further
  if (p==1) { 
     farm pprintf("solveseq %d\n", n);
     farm solveseq( presult, data, n, elsize );
     return;
  }
  
  // divide into k subproblems of suitable size,
  divide( &k, &subproblem, &subsize, data, n, elsize );
  farm assert( k > 1 );
  if (k > p) {   // not enough processors, solve sequentially:
     farm pprintf("must solve seq %d\n", n);
     seq solveseq( presult, data, n, elsize );
     return;
  }

  // solve subproblems recursively in parallel;
  // subgroup @ receives &subsolution[@] as its result slot:
  subsolution = (void ***)shalloc( k * sizeof(void **));
  fork( k; @=$%k; ) {
     // pp is declared inside the fork body; mpadd() on it presumably hands
     // each processor a distinct new $ starting at 0 - TODO confirm
     sh int pp = 0;
     $ = mpadd( &pp, 1 );
     divide_conquer( &(subsolution[@]),
                     subproblem[@], subsize[@], elsize,
                     solveseq, istrivial, solvetrivial, 
                     divide, conquer );
  }
  
  // now do the conquer operation to compute the result
  // and store it via presult:
  conquer( k, presult, subsolution, subsize, n, elsize );

  shallfree();
}


// ----------------------- problem-specific routines: ------

/** Triviality test for the skeleton: yields 1 when the problem size N
 *  is at most 6, and 0 otherwise.
 */
sync int issmall( sh int N )
{
  return ( N <= 6 );
}


/** Divide step: splits the operand array into two halves.
 *
 *  Sets *pk to 2 and hands back two freshly shalloc'ed 2-entry arrays:
 *  *psubproblem holds the start of each half, *psubsize holds n/2 and
 *  n - n/2 (the second half gets the extra element when n is odd).
 */
sync void split2( sh int *pk, sh void ***psubproblem, sh int **psubsize,
                  sh void **data, sh int n, sh int elsize )
{
  sh int lower = n / 2;     // element count of the first half
  sh void **halves = (void **) shalloc( 2 * sizeof(void *) );
  sh int *sizes = (int *) shalloc( 2 * sizeof(int) );

  sizes[0] = lower;
  sizes[1] = n - lower;

  halves[0] = data;
  // NOTE(review): data has type void **, so this offset is scaled by the
  // pointer size as well as by elsize; presumably this relies on the
  // target's type sizes - confirm before porting.
  halves[1] = data + lower * elsize;

  // publish the results through the out-parameters
  *pk = 2;
  *psubproblem = halves;
  *psubsize = sizes;
}


/** Sequential solver: sums n floats.
 *
 *  presult  address of the result cell; the sum is written, as a float,
 *           directly into the cell that presult points at
 *  data     operand array, read as an array of n floats
 *  n        number of elements to add (0 yields a sum of 0.0)
 *  elsize   element size; not used here
 *
 *  No separate result block is allocated: the float occupies the cell
 *  itself, so a block pointer stored there would be overwritten at once
 *  and the block could never be freed.
 */
void seqfloatsum( void ***presult, void **data, int n, int elsize )
{
  int i;
  float sum = 0.0;
  for (i=0; i<n; i++)
     sum += ((float *)data)[i];
  *(float *)presult = sum;   // result lives in the cell itself
}


/** Conquer step for summing: add two partial sums.
 *
 *  data is read as an array of floats; entries 0 and 1 are the partial
 *  sums. Exactly two entries are read: k, subsize, n and elsize are not
 *  used.
 *  The total is written, as a float, directly into the cell that pdest
 *  points at.
 *
 *  No separate result block is allocated: the float occupies the cell
 *  itself, so a block pointer stored there would be overwritten at once
 *  and the block could never be freed.
 */
sync void addFloat( sh int k, sh void ***pdest, sh void ***data,
                  sh int *subsize, sh int n, sh int elsize )
{
 sh float s1 = ((float *)data)[0];
 sh float s2 = ((float *)data)[1];
 // the store is done inside a seq statement
 seq {
  *(float *)pdest = s1 + s2;
 }
}


/** Demo driver: fills an array with 1..N, prints it, sums it with the
 *  divide_conquer skeleton and prints the sum.
 */
void main( void )    // float sum with the divide_conquer skeleton
{
  start {
    sh int N = 27;
    sh float *a = (float *)shalloc( N * sizeof(float));
    // Result cell handed to the skeleton. The solver and conquer routines
    // passed below store the float sum directly in this cell, so it is
    // read back through a float view. It is written by divide_conquer()
    // before it is read.
    sh void **cell;
    sh float *psum = (float *)&cell;
    sh int p = groupsize();
    int i;

    // a[i] = i+1, so the expected sum is N*(N+1)/2 = 378 for N = 27
    forall(i,0,N,p)  a[i] = (float)(i+1);   // init
    seq {
      printf("Array: ");
      for (i=0; i<N; i++) printf(" %f", a[i]);
      printf("\n");
    } 
    // seqfloatsum serves both as the sequential and as the trivial solver
    divide_conquer( &cell, (void **)a, N, sizeof(float),
                    seqfloatsum, issmall, seqfloatsum, split2, addFloat );
    seq
      printf("Sum: %f\n", *psum );
  }
}
