// Parallel divide-and-conquer skeleton,
// applied to a recursive parallel floating-point mergesort.

#include <fork.h>
#include <io.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>


// Generic parallel divide-and-conquer skeleton.
//
// Depending on the group size p and on what the callbacks report, the
// problem (data, n) is handled in one of four ways:
//   - p == 1:              solveseq() is called under `farm`;
//   - istrivial(n) != 0:   solvetrivial() is called under `seq`;
//   - divide() produced k > p subproblems: solveseq() is called under `seq`;
//   - otherwise: fork() creates k subgroups, each of which recurses on
//     subproblem[@] with size subsize[@], and conquer() then combines the
//     k subsolutions.
//
// This function never assigns *presult itself; the callback that ends up
// handling the problem (solveseq, solvetrivial or conquer) receives presult
// and is expected to store the result array pointer there.
//
// Callbacks:
//   solveseq      sequential solver for a whole (sub)problem
//   istrivial     returns nonzero if a problem of nn elements is trivial
//   solvetrivial  solver used for trivial problems
//   divide        sets *pk, *psubprob and *psubsiz to the number of
//                 subproblems, their start pointers and their sizes
//   conquer       combines kk subsolutions into *pres
sync void divide_conquer(
  sh void ***presult, // address of result array
  sh void **data,   // operand array
  sh int n,         // #elements in operand array
  sh int elsize,    // element size in operand array
  sh void (*solveseq)( void ***pres, void **dat, int nn, int elsiz ),
  sh sync int (*istrivial)( sh int nn ),
  sh void (*solvetrivial)( void ***pres, void **dat, int nn, int elsiz ),
  sh sync void (*divide)( sh int *pk, sh void ***psubprob, sh int **psubsiz, 
                          sh void **dat, sh int nn, sh int elsiz ),
  sh sync void (*conquer)( sh int kk, sh void ***pres, sh void ***subsol,
                           sh int *ss, sh int nn, sh int elsiz )
 )
{
  sh int p = groupsize();             // number of processors in this group
  sh int k;                           // number of subproblems, set by divide()
  sh int istr;                        // result of istrivial(n)
  sh int *subsize;                    // subsize[j]: #elements of subproblem j
  sh void **subproblem, ***subsolution; // per-subproblem input / result pointers
  int i;                              // NOTE(review): not used in this function

  // a single processor cannot split any further: solve sequentially
  if (p==1) { 
     farm solveseq( presult, data, n, elsize );
     return;
  }

  // trivial problems are solved directly, without dividing
  istr = istrivial( n );
  if (istr) {
     seq solvetrivial( presult, data, n, elsize );
     return;
  }
  
  // divide into k subproblems of suitable size,
  divide( &k, &subproblem, &subsize, data, n, elsize );
  farm assert( k > 1 );
  if (k > p) {   // not enough processors, solve sequentially:
     // NOTE(review): this return (like the two above) does not reach the
     // shallfree() at the end of the function, although divide() has
     // already run -- confirm that this is intended.
     seq solveseq( presult, data, n, elsize );
     return;
  }

  // solve subproblems recursively in parallel,
  // store the result pointer of subproblem j in subsolution[j]:
  subsolution = (void ***)shalloc( k * sizeof(void **));
  fork( k; @=$%k; ) {     // k subgroups; @ is set to $ % k and selects the subproblem
     // presumably renumbers $ inside the subgroup via a shared counter
     // (mpadd = multiprefix add) -- confirm against the Fork reference
     sh int pp = 0;
     $ = mpadd( &pp, 1 );
     divide_conquer( &(subsolution[@]),
                     subproblem[@], subsize[@], elsize,
                     solveseq, istrivial, solvetrivial, 
                     divide, conquer );
  }

  // now do the conquer operation to compute the result,
  // allocate *presult and store the result therein:
  conquer( k, presult, subsolution, subsize, n, elsize );

  shallfree();
}


// -------------- the problem-specific routines: ----------

// Triviality test used by the skeleton: reports 1 for a problem of
// at most 5 elements and 0 for anything larger.
sync int issmall( sh int N )
{
  return N <= 5;
}


// Divide step for mergesort: split the operand array into two halves.
//
//   *pk           is set to 2 (number of subproblems)
//   *psubproblem  receives a shalloc'ed array of 2 start pointers:
//                 data itself and the start of the second half
//   *psubsize     receives a shalloc'ed array of 2 element counts:
//                 n/2 and n - n/2
//   data, n       operand array and its number of elements
//   elsize        size of one element, in sizeof units
sync void split2( sh int *pk, sh void ***psubproblem, sh int **psubsize,
                  sh void **data, sh int n, sh int elsize )
{
  sh int nd2 = n / 2;
  *pk = 2;
  *psubproblem = (void **) shalloc( 2 * sizeof(void *) );
  *psubsize = (int *) shalloc( 2 * sizeof(int) );
  (*psubsize)[0] = nd2;
  (*psubsize)[1] = n - nd2;
  (*psubproblem)[0] = data;
  // The second half starts nd2 elements of elsize units each past data.
  // data is declared void **, so arithmetic directly on it would scale
  // the offset by sizeof(void *) in addition to elsize; go through char *
  // so the offset is exactly nd2 * elsize.
  (*psubproblem)[1] = (char *)data + nd2 * elsize;
}


// Three-way comparison of the floats that a and b point to:
// returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise.
int compare( void *a, void *b )
{
  float lhs = *(float *)a;
  float rhs = *(float *)b;

  // each relational test yields 0 or 1, so the difference is -1, 0 or 1
  return (lhs > rhs) - (lhs < rhs);
}


// Sequential solver: copies the n elements of data (elsize each) into a
// freshly shmalloc'ed buffer, sorts that copy with qsort() using compare(),
// and stores the buffer's address in *presult. data itself is not modified.
void seqsort( void ***presult, void **data, int n, int elsize )
{
  int nbytes = n * elsize;
  void **sorted = (void **)shmalloc( nbytes );

  memcpy( sorted, data, nbytes );
  qsort( sorted, n, elsize, compare );
  *presult = sorted;
}


/* Sequentially compute the rank of key within the ascending-sorted
 * array of size n, i.e. the number of array elements < key.
 *
 * Implemented as a lower-bound binary search:
 *  - an empty array (n <= 0) yields 0 without touching array;
 *  - a key equal to an array element yields the index of the first
 *    such element (so a key equal to the maximum yields n-1, not n);
 *  - duplicate array elements are handled.
 */
int getRankFloat( float key, float *array, int n )
{
 int lo = 0;    // every element at an index < lo is < key
 int hi = n;    // every element at an index >= hi is >= key
 while (lo < hi) {
    int mid = lo + (hi - lo)/2;
    if (array[mid] < key) lo = mid + 1;
    else                  hi = mid;
 }
 return lo;
}
 


/* Conquer step for mergesort: merge the float arrays src1 = data[0]
 * (n1 = subsize[0] elements) and src2 = data[1] (n2 = subsize[1] elements)
 * into one newly shmalloc'ed array of n = n1+n2 floats, whose address is
 * stored in *pdest.
 *
 * Method: every element is written to position (its own index) + (its rank
 * in the other array), where the rank comes from getRankFloat(). Since
 * getRankFloat() binary-searches the other array, both inputs must already
 * be sorted in ascending order.
 *
 * Assertions: p>1, n1*n2>=1. dest array is not yet allocated.
 * Implementation assumes that all elements are mutually different.
 * The parameters k and elsize are not used; the element type is fixed
 * to float.
 */
sync void mergeFloat( sh int k, sh void ***pdest, sh void ***data,
                  sh int *subsize, sh int n, sh int elsize )
{
 sh float *src1 = ((float **)data)[0];   // first input array
 sh float *src2 = ((float **)data)[1];   // second input array
 sh int n1 = subsize[0];
 sh int n2 = subsize[1];
 sh int p = groupsize();
 sh int iter;              // shared loop counter handed to FORALL
 sh int *rank12, *rank21;  // temporary rank arrays
 pr int i;
 farm assert(p>1);
 farm assert(n1+n2 == n);
 // rank12[i]: rank of src1[i] within src2; rank21[i]: rank of src2[i] within src1
 rank12 = (int *)shalloc( n1 * sizeof(int) );
 rank21 = (int *)shalloc( n2 * sizeof(int) );
 seq *pdest = (void **)shmalloc( n * sizeof(float) );
 // FORALL is a macro from outside this file; presumably it hands out the
 // indices 0..n1-1 via the shared counter iter -- confirm in fork.h
 farm
   FORALL( i, &iter, 0, n1, 1 )
     rank12[i] = getRankFloat( src1[i], src2, n2 ); 
 farm
   FORALL( i, &iter, 0, n2, 1 )
     rank21[i] = getRankFloat( src2[i], src1, n1 ); 
 farm {
   /* copy elements to *pdest using the rank information */
   // forall(i, 0, n, p): presumably a cyclic distribution of the indices
   // over the p processors -- confirm in fork.h
   forall( i, 0, n1, p )  
      (*(float **)pdest)[i+rank12[i]] = src1[i];
   forall( i, 0, n2, p)
      (*(float **)pdest)[i+rank21[i]] = src2[i];
 }
 // releases the shalloc'ed rank arrays; the shmalloc'ed result is kept
 // (NOTE(review): per Fork shallfree()/shmalloc() semantics -- confirm)
 shallfree();
}


// Program entry: fills an array of N = 47 floats with N*sin(i)^2, prints it,
// sorts it with the divide_conquer skeleton instantiated for mergesort
// (seqsort / issmall / split2 / mergeFloat), and prints the sorted result.
void main( void )    // Mergesort with divide_conquer skeleton
{
  start {
    sh int N = 47;
    sh float *a = (float *)shalloc( N * sizeof(float));
    // Shared cell that receives the pointer to the sorted array.
    // divide_conquer's callbacks store through the presult argument, and
    // the print loop below dereferences psorted, so it must point at real
    // storage rather than being left uninitialized.
    sh float **psorted = (float **)shalloc( sizeof(float *) );
    sh int p = groupsize();
    int i;

    farm forall(i,0,N,p)  a[i] = (float)(N*sin(i)*sin(i));   // init
    seq {
      printf("Unsorted: ");
      for (i=0; i<N; i++) printf(" %f", a[i]);
      printf("\n");
    } 
    // only processors with $ < N take part in the sort
    if ($<N)
    divide_conquer( (void ***)psorted, (void **)a, N, sizeof(float),
                    seqsort, issmall, seqsort, split2, mergeFloat );
    seq {
      printf("Sorted: ");
      for (i=0; i<N; i++) printf(" %f", (*(float **)psorted)[i]);
      printf("\n");
    } 
  }
}
