/* Fast-Fourier-Transform in Fork95.   C.W.Kessler 08/96 */

#include <fork.h>
#include <io.h>
#include <math.h>
#include <stdlib.h>

double one = 1.0;    /* workaround; local constants crash */
double zero = 0.0;   /* workaround; local constants crash */
          
extern void shfree( void * );

typedef struct {
  double re, im; 
} complex_number, *cplx;

/* The following routines for complex arithmetics
 * will later be available in math.h as assembler routines:
 */

async cplx cnum(pr float a, pr float b) 
{
  cplx c = (cplx) shmalloc( sizeof( complex_number ));
  c->re = a;
  c->im = b;
  return c;
}

#define freecplx(x) shfree(x)

#define shfreecplxarray(a,n) \
{ \
   int k; \
   /* for (k=0; k<n; k++)  shfree(a[k]); */ \
   shfree(a); \
}

async cplx cadd(pr cplx a, pr cplx b)
{
  return cnum( a->re+b->re, a->im+b->im );
}

async cplx csub(pr cplx a, pr cplx b)
{
  return cnum( a->re-b->re, a->im-b->im );
}

async cplx cmul(pr cplx a, pr cplx b)
{
  return cnum( a->re*b->re - a->im*b->im,
               a->im*b->re + a->re*b->im );
}

sync cplx *seq_fft( sh cplx *a, sh int n, sh cplx *w )
{
  /* seq. recursive FFT implementation for complex array a[].
   * Assume n is a power of 2. Operand array a is not corrupted.
   */
 sh cplx *ft;                            /*result array*/
 sh cplx *even, *odd, *fteven, *ftodd;   /*temporary pointers*/
 sh int ndiv2;
 pr int i;

 if (n==1)  {
    seq { ft = shmalloc(1);
          ft[0] = cnum( a[0]->re, a[0]->im );  }
    return ft;
 }

 /* the general case: compute temporary arrays even[], odd[]
  *                   and in parallel call fft for each of them
  */
 seq  ft = (cplx *) shmalloc( n );  /* allocate result array */
 ndiv2 = n>>1;
 even = (cplx *) shalloc( ndiv2 );  /* there is more space on the */
 odd  = (cplx *) shalloc( ndiv2 );  /* shared heap than on private */
 for( i=0; i<ndiv2; i++ ) {
   even[i] = a[2*i];             /* copy pointer to same number */
   odd[i]  = a[2*i+1];
 }
 fteven = seq_fft( even, ndiv2, w );
 ftodd = seq_fft( odd, ndiv2, w );

 farm 
  for( i=0; i<ndiv2; i++ ) {   /*sequential loop*/
    pr cplx t = cmul( w[i], ftodd[i] ); 
    ft[i]        = cadd( fteven[i], t );
    ft[i+ndiv2]  = csub( fteven[i], t );
    freecplx( t );
  }                            /*space and calls to shmalloc/shfree  */
 seq  shfreecplxarray( fteven, ndiv2 );
 seq  shfreecplxarray( ftodd, ndiv2 );
 shallfree();   /* free even, odd */
 return ft;
}

sync cplx *fft( sh cplx *a, sh int n, sh cplx *w )
{
  /* recursive FFT implementation for complex array a[].
   * complex array w[] contains the powers of the primitive
   * n'th root w of unity: { 1, w, w^2, w^3, ..., w^(n-1) }.
   * Assume n is a power of 2.
   * Operand array a is not corrupted.
   */
 sh cplx *ft;                            /*result array*/
 sh cplx *even, *odd, *fteven, *ftodd;   /*temporary pointers*/
 sh int p = 0;
 sh int ndiv2;
 pr int i;

 $ = mpadd( &p, 1 );  /* ensure consecutive numbering of procs */

 seq  prS("fft\n");   /*diagnosis output*/

 if (n==1)  {
    seq { ft = shmalloc(1);
          ft[0] = cnum( a[0]->re, a[0]->im );  }
    return ft;
 }
 if (p==1)  return seq_fft( a, n, w );

 /* general case: compute temporary arrays even[], odd[]    *
  *               and in parallel call fft for each of them */
 seq  ft = (cplx *) shmalloc( n );  /* allocate result array */
 ndiv2 = n>>1;
 even = (cplx *) shalloc( ndiv2 );
 odd  = (cplx *) shalloc( ndiv2 );
 for( i=$; i<ndiv2; i+=p ) {     /*dataparallel loop*/
   even[i] = a[2*i];             /* copy pointer to same number */
   odd[i]  = a[2*i+1];
 }
 if ($<p/2)  /*split current group into two equally sized subgroups:*/
      fteven = fft( even, ndiv2, w );
 else                                 ftodd = fft( odd, ndiv2, w );

 farm
  for( i=$; i<ndiv2; i+=p ) {     /*dataparallel loop*/
    pr cplx t = cmul( w[i], ftodd[i] ); 
    ft[i]        = cadd( fteven[i], t );
    ft[i+ndiv2]  = csub( fteven[i], t );
    freecplx( t );
  }
 seq  shfreecplxarray( fteven, ndiv2 );
 seq  shfreecplxarray( ftodd, ndiv2 );
 shallfree();  /* free even, odd */
 return ft;
}

sync cplx *FFT( sh cplx *a, sh int n, sh cplx w )
{
 sh cplx *ft;
 sh cplx *pow = (cplx *) shalloc( n );
 sh cplx wtoi, newwtoi;
 sh int p = groupsize();
 sh int twotoi, j;
 /* compute vector pow[] of powers of w: { 1, w, w^2, ..., w^(n-1) } */
 seq {
    wtoi = cnum( w->re, w->im );
    pow[0] = cnum(1.0,0.0); pow[1] = cnum( w->re, w->im ); 
 }
 for (twotoi=2; twotoi<n; twotoi*=2) {
   seq wtoi = cmul( wtoi, wtoi ); 
   farm
     gforall (j, 0, twotoi, p)
       pow[twotoi+j] = cmul( wtoi, pow[j] );
 }
 ft = fft( a, n, pow );   /* call recursive fft routine */
 shallfree();
 return ft;
}


main()
{
 start {
#ifdef LARGE_N
  sh int n=4096;
#else
  sh int n=16;
#endif
  sh cplx *a;
  sh cplx w;
  sh cplx *ft;
  sh unsigned int starttime, stoptime;
  sh double t = 2*PI/(double)n;
  pr int i;

  seq {
     a = (cplx *) shmalloc( n );
     w = cnum( cos(t), sin(t) );
     for (i=0; i<n; i++)
        a[i] = cnum( cos((double)i), sin((double)i));
#ifndef LARGE_N
     a[0] = cnum(-5,0);     a[1] = cnum(-1.55,0);   /*example data */
     a[2] = cnum(-0.939,0); a[3] = cnum( 1.6, 0);   /*see [Quinn93]*/
     a[4] = cnum( 3,0);     a[5] = cnum( 3.51,0);
     a[6] = cnum(3.77,0);   a[7] = cnum( 1.66,0);
     a[8] = cnum(-1.5,0);   a[9] = cnum(-2.7,0);
     a[10] = cnum(-3.06,0); a[11] = cnum(-3.02,0);
     a[12] = cnum(-1.00,0); a[13] = cnum(0.736,0);
     a[14] = cnum(0.232,0); a[15] = cnum(-0.25,0);
#endif
     for (i=0; i<n; i++)
        printf("%5.2f+i*%5.2f ", a[i]->re, a[i]->im);
     printf("\nCompute FFT:\n");
  }
  farm starttime = getct();
  ft = FFT( a, n, w );
  farm stoptime = getct();
  seq {
     for (i=0; i<n; i++) 
        printf("%5.2f+i*%5.2f ", ft[i]->re, ft[i]->im);
  }
  seq printf("\nTime: %u PRAM CPU cycles\n", stoptime - starttime);
 }
 barrier;
 exit(0);
}
