/*  BPL.c    CWK 941127    Basic PARAMAT Pattern Library FORK Implementation*/

#include <fork.h>
#include <syscall.h>
#include <io.h>

#include "bpl.h"

/* File-scope state of the test driver in main(). */
sh int m =20;          /* number of rows of matrix a; length of vectors y and b */
sh int n =20;          /* length of each row of a; length of vectors r, x and d */
 /* The program crashes for unknown reasons when m >= n. */
 int i;                /* loop index used by the initialisation loops in main() */
 int j;                /* inner loop index used when filling the rows of a */
 int itm;              /* not referenced anywhere in the visible part of this file */
 int ire = 4;          /* upper bound (inclusive) of the iteration loop in main() */
 int im;               /* counter of the iteration loop in main() */
sh float s;            /* receives the _VQSUM result over y inside the iteration loop */
sh float rm;           /* receives the _VQSUM result over r */
sh float beta;         /* scalar factor passed to _VADDSV when updating d */


/*
 * Test driver for the BPL kernels.
 *
 * Allocates an m x n matrix a plus the work vectors r, x, d (length n) and
 * y, b (length m), presets a and x, and then runs `ire` rounds of the
 * kernel sequence below on all __STARTED_PROCS__ processors.
 *
 * NOTE(review): b is allocated but never written before _VAADD reads it.
 * NOTE(review): the rows of a come from malloc() while every other buffer
 *               comes from shalloc(); confirm that rows allocated this way
 *               are reachable by the other processors inside _MV.
 * NOTE(review): the calls marked TRANSP pass 0 as the Atransp argument, so
 *               _MV takes its non-transposed branch there; confirm intent.
 */
main() { start
  {
    sh matrix a = (matrix) shalloc(m);    /* m row pointers */
    sh vector r = (vector) shalloc(n);
    sh vector x = (vector) shalloc(n);
    sh vector y = (vector) shalloc(m);
    sh vector b = (vector) shalloc(m);
    sh vector d = (vector) shalloc(n);

    /* Rows are dealt out cyclically: processor $ handles rows $, $+P, ... */
    for (i=$; i<m; i+= __STARTED_PROCS__)
      farm a[i] = (vector) malloc(n);

    /* Every row holds n entries, so exactly n entries are preset per row.
       (Looping up to m here would run past the row whenever m > n.) */
    for (i=$; i<m; i+= __STARTED_PROCS__)
      farm for (j=0; j<n; j++)  a[i][j] = 0.000001;

    for (i=$; i<n; i+= __STARTED_PROCS__)
      farm x[i] = (float) 1.0;

    /* Start-up phase: y = a*x; y += b; r = a*y; d = r; rm = sum of r[i]^2 */
    _MV (__STARTED_PROCS__,m,n,y,a,x,0.000000,0,0);
    _VAADD (__STARTED_PROCS__,m,y,b);
    _MV (__STARTED_PROCS__,n,m,r,a,y,0.000000,0,0/*TRANSP*/);
    _VCOPY (__STARTED_PROCS__,n,d,r);
    _VQSUM (__STARTED_PROCS__,n,&rm,r,0.000000);

    for (im=1; im<=ire; im++)
    {
      /* y = a*d; s = sum of y[i]^2 */
      _MV (__STARTED_PROCS__,m,n,y,a,d,0.000000,0,0);
      _VQSUM (__STARTED_PROCS__,m,&s,y,0.000000);

      /* x is updated from d with a step factor that is currently fixed to
         1.0; the original quotient is kept in the comment below. */
      {sh float _temp243 = /*(float)((int)rm/(int)s) */ 1.0;
       _VAADDSV (__STARTED_PROCS__,n,x,_temp243,d);
      }

      /* y = a*x; y += b; r = a*y */
      _MV (__STARTED_PROCS__,m,n,y,a,x,0.000000,0,0);
      _VAADD (__STARTED_PROCS__,m,y,b);
      _MV (__STARTED_PROCS__,n,m,r,a,y,0.000000,0,0/*TRANSP*/);

      /* beta is zeroed before rm is refreshed, so the expression
         rm*beta + 1.0 below multiplies the new rm by 0.0. */
      beta=0.0/*1.0/(rm)*/;
      _VQSUM (__STARTED_PROCS__,n,&rm,r,0.000000);
      beta=rm*beta + 1.0;

      /* d = beta*d + r */
      _VADDSV (__STARTED_PROCS__,n,d,beta,d,r);
      /*_VWRITE( __STARTED_PROCS__, n, d );*/
    }
  }
}


/*
 * Matrix-vector product, block-distributed over p processors.
 *
 * Computes x[i] = start(i) + sum over j<m of A(i,j) * b[j]  for 0 <= i < n,
 * where start(i) is initvec[i] if initvec is non-null and init otherwise,
 * and A(i,j) is A[i][j] when Atransp is 0 and A[j][i] otherwise.
 *
 * Processor $ owns the index range [lbi, ubi) with lbi = $ * (n/p + 1) and
 * ubi clamped to n. Only the entries inside that clamped range are
 * computed, so neither A nor initvec is read beyond index n-1.
 *
 * The private accumulator tmp is allocated with malloc() on every call and
 * is not released inside this function.
 */
sync void _MV (   /* short version of MV(2) matrix-vector product */
  sh int p,       /* #processors executing this call (synchronous!) */
  sh int n,       /* length of result vector x (problem size) */
  sh int m,       /* length of operand vector b (summation length) */
  sh vector x,    /* return value */
  sh matrix A,    /* matrix operand, short form */
  sh vector b,    /* vector operand, short form */
  sh float init,  /* the initialization value, if not a vector */
  sh vector initvec, /* the initialization vector, if not a scalar */
  sh int Atransp) /* 1 if A transposed, 0 otherwise */
{
  pr int i=0, j=0;                                   /* private loop index */
  pr int slice = n/p + 1;                            /* local problem size */
  pr vector tmp;                                /*temp. accumulator vector */
  pr int lbi = $ * slice;                    /* lower bound of local slice */
  pr int ubi = lbi + slice;                  /* upper bound of local slice */
  pr int len = 0;              /* #entries really owned; <= 0 if lbi >= n */
  farm {
    tmp = (vector) malloc(slice);
    if (ubi >= n) ubi = n;                    /* clamp the last slice to n */
    len = ubi - lbi;
    if (initvec)
      for (i=0; i<len; i++) tmp[i] = initvec[i+lbi];  /*preset local slice*/
    else 
      for (i=0; i<slice; i++) tmp[i] = init;
  }
  if (lbi < n) {
    if (!Atransp) {
      farm for (j=0; j<m; j++)     /* summing loop is fully sequentialized */
             for (i=0; i<len; i++)         /* innermost: saxpy computation */
               tmp[i] = tmp[i] + A[i+lbi][j] * b[j];
    }
    else {
      farm for (j=0; j<m; j++)     /* summing loop is fully sequentialized */
             for (i=0; i<len; i++)       /* same, walking a column of A */
               tmp[i] = tmp[i] + A[j][i+lbi] * b[j];
    }
  }
  for (i=lbi; i<ubi; i++) x[i] = tmp[i-lbi];     /* publish the local slice */
}
 

/*
 * In-place vector addition: a[k] = a[k] + v[k] for 0 <= k < n.
 * Elements are distributed cyclically: processor $ handles $, $+p, $+2p, ...
 */
sync void _VAADD (  /* short version of VAADD(1) self-vector addition */
  sh int p,         /* #processors executing this call (synchronous!) */
  sh int n,         /* length of arrays _and_ array accesses (problem size) */
  sh vector a,      /* target and additive operand vector, short form */
  sh vector v)      /* operand vector, short form.     a[] += v[]    */
{
  pr int k;                          /* private cursor into both vectors */
  farm {
    k = $;                           /* start at this processor's own id */
    while (k < n) {
      a[k] += v[k];
      k += p;                        /* hop to the next element owned here */
    }
  }
}


/*
 * Scalar-vector multiplication: a[k] = s * v[k] for 0 <= k < n.
 * Elements are distributed cyclically: processor $ handles $, $+p, $+2p, ...
 */
sync void _SV (     /* short version of SV(1) scalar-vector multiplication */
  sh int p,         /* #processors executing this call (synchronous!) */
  sh int n,         /* length of arrays _and_ array accesses (problem size) */
  sh vector a,      /* target vector, short form */
  sh float s,       /* scalar factor for v */
  sh vector v)      /* operand vector, short form.     a[] = s*v[]    */
{
  pr int k;                          /* private cursor into both vectors */
  farm {
    k = $;                           /* start at this processor's own id */
    while (k < n) {
      a[k] = s * v[k];
      k += p;                        /* hop to the next element owned here */
    }
  }
}


/*
 * Accumulating scalar-vector triad: a[i] = a[i] + s * v[i] for 0 <= i < n.
 * Elements are distributed cyclically: processor $ handles $, $+p, $+2p, ...
 *
 * The previous body assigned s * v[i] without adding it to a[i], which made
 * this routine indistinguishable from _SV; it now accumulates as its
 * interface comment describes.
 */
sync void _VAADDSV (/* short version of VAADDSV(1) acc. scalar-vector triad */
  sh int p,         /* #processors executing this call (synchronous!) */
  sh int n,         /* length of arrays _and_ array accesses (problem size) */
  sh vector a,      /* target and additive operand vector, short form */
  sh float s,       /* scalar factor for v */
  sh vector v)      /* operand vector, short form.     a[] += s*v[]    */
{
  pr int i=0;                                      /* private loop index */
  farm 
    for (i=$; i<n; i+=p)       /*min(n,p) processors operate in parallel */
      a[i] += s * v[i];           /* each PE computes up to n/p elements */
}


/*
 * Non-accumulating scalar-vector triad: a[k] = s * v[k] + y[k], 0 <= k < n.
 * Elements are distributed cyclically: processor $ handles $, $+p, $+2p, ...
 * The target a may be the same vector as v (main() calls it that way).
 */
sync void _VADDSV (/* short version of VADDSV(1) nonacc. scalar-vector triad */
  sh int p,        /* #processors executing this call (synchronous!) */
  sh int n,        /* length of arrays _and_ array accesses (problem size) */
  sh vector a,     /* target vector, short form */
  sh float s,      /* scalar factor for v */
  sh vector v,     /* operand vector, short form.     a[] = s*v[] + y[]    */
  sh vector y )    /* additive operand vector, short form */
{
  pr int k;                          /* private cursor into the vectors */
  farm {
    k = $;                           /* start at this processor's own id */
    while (k < n) {
      a[k] = s * v[k] + y[k];
      k += p;                        /* hop to the next element owned here */
    }
  }
}


/*
 * Vector copy: a[k] = b[k] for 0 <= k < n.
 * Elements are distributed cyclically: processor $ handles $, $+p, $+2p, ...
 */
sync void _VCOPY (/* short version of VCOPY(1) vector copy */
  sh int p,       /* #processors executing this call (synchronous!) */
  sh int n,       /* length of arrays _and_ array accesses (problem size) */
  sh vector a,    /* target vector, short form */
  sh vector b )   /* source vector, short form */
{
  pr int k;                          /* private cursor into both vectors */
  farm {
    k = $;                           /* start at this processor's own id */
    while (k < n) {
      a[k] = b[k];
      k += p;                        /* hop to the next element owned here */
    }
  }
}


/*
 * Dot product: *s = init + sum over k<n of a[k] * b[k].
 *
 * Phase 1: every processor sums the products of its cyclically assigned
 *          elements ($, $+p, ...) into a private partial sum and stores it
 *          in scratch[$]; processors with $ >= n store 0.0.
 * Phase 2: the p partial sums are combined in rounds with stride 1, 2, 4, ...
 *          In each round every processor whose partner $+stride exists adds
 *          the partner's slot to its own.
 *          NOTE(review): a round reads scratch[$+stride] while that slot's
 *          owner may be writing it; this relies on all reads of a round
 *          happening before its writes -- confirm that farm provides this.
 *
 * scratch is obtained from shalloc() on every call and is not released
 * inside this function.
 */
sync void _SSP (  /* short version of SSP(1) standard dot product */
  sh int p,       /* #processors executing this call (synchronous!) */
  sh int n,       /* length of arrays _and_ array accesses (problem size) */
  sh float *s,    /* scalar return value */
  sh vector a,    /* first operand vector, short form */
  sh vector b,    /* second operand vector, short form */
  sh float init ) /* the initialization value */
{
  pr int k = 0;                             /* private element/round index */
  pr float partial = 0.0;                           /* private partial sum */
  sh float *scratch = (float *)shalloc(p);   /* one slot per processor */
  farm {
    k = $;
    while (k < n) {
      partial += a[k] * b[k];
      k += p;
    }
    if ($ >= n) scratch[$] = 0.0;          /* no element was assigned here */
    else        scratch[$] = partial;
  }
  for (k=1; k<p; k*=2)                      /* k now acts as the stride */
    farm
      if ($+k<p)
        scratch[$] = scratch[$] + scratch[$+k];
  *s = scratch[0] + init;           /* slot 0 ends up with the total sum */
}

/*
 * Sum of squares: *s = init + sum over k<n of a[k] * a[k].
 *
 * Phase 1: every processor sums the squares of its cyclically assigned
 *          elements ($, $+p, ...) into a private partial sum and stores it
 *          in scratch[$]; processors with $ >= n store 0.0.
 * Phase 2: the p partial sums are combined in rounds with stride 1, 2, 4, ...
 *          In each round every processor whose partner $+stride exists adds
 *          the partner's slot to its own.
 *          NOTE(review): a round reads scratch[$+stride] while that slot's
 *          owner may be writing it; this relies on all reads of a round
 *          happening before its writes -- confirm that farm provides this.
 *
 * scratch is obtained from shalloc() on every call and is not released
 * inside this function.
 */
sync void _VQSUM (/* short version of VQSUM(1) square dot product */
  sh int p,       /* #processors executing this call (synchronous!) */
  sh int n,       /* length of arrays _and_ array accesses (problem size) */
  sh float *s,    /* scalar return value */
  sh vector a,    /* the operand vector, short form */
  sh float init ) /* the initialization value */
{
  pr int k = 0;                             /* private element/round index */
  pr float partial = 0.0;                           /* private partial sum */
  sh float *scratch = (float *)shalloc(p);   /* one slot per processor */
  farm {
    k = $;
    while (k < n) {
      partial += a[k] * a[k];
      k += p;
    }
    if ($ >= n) scratch[$] = 0.0;          /* no element was assigned here */
    else        scratch[$] = partial;
  }
  for (k=1; k<p; k*=2)                      /* k now acts as the stride */
    farm
      if ($+k<p)
        scratch[$] = scratch[$] + scratch[$+k];
  *s = scratch[0] + init;           /* slot 0 ends up with the total sum */
}

/*
 * Debug helper: passes the first n elements of v to prF().
 *
 * Processor $ handles the elements $, $+p, $+2p, ...  Before each call to
 * prF() it spins through an empty loop of $ iterations, so processors with
 * a higher id reach prF() later.
 * NOTE(review): presumably this stagger is meant to keep the output in
 * index order; nothing in this file enforces that ordering -- confirm.
 * prF() is not defined in this file (presumably a float output routine
 * from the included system headers -- confirm).
 */
sync void _VWRITE( /* print first n elements of (short v.) vector v in par*/
  sh int p,        /* #processors executing this call, 0 <= $ < p    */
  sh int n,        /* vector size */
  sh vector v )    /* vector to be printed */
{
  pr int i, j;     /* i: element index owned by this processor; j: delay counter */
  for (i=$; i<n; i+= p) 
     farm {
       for (j=0;j<$;j++);                             /* delay loop */
       prF( v[i] );                            /* print ith element */
     }
}

