/*
 * lu.c
 * kirk johnson
 * october 1994
 *
 * Copyright (C) 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies
 * and that both that copyright notice and this permission notice
 * appear in supporting documentation. The author makes no
 * representations about the suitability of this software for any
 * purpose. It is provided "as is" without express or implied
 * warranty.
 *
 * THE AUTHORS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
 * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * RCS $Id: lu.c,v 1.9 1995/08/22 21:27:49 tuna Exp $
 */

#define USE_CRL
/*#define STATISTICS*/
/*#define LU_SANITY*/
/*#define CHECK_RESULT*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* alewife cpp seems to be leaving "unix" #define-d, so #undef it
 */
#if defined(ALEWIFE) && defined(unix)
#undef unix
#endif

#if defined(CM5)
#include <cm/cmmd.h>
#define assert(x)                                  \
 do {                                              \
  if (!(x))                                        \
    CMMD_error("pn %d: failed assertion, %s:%d\n", \
	       crl_self_addr, __FILE__, __LINE__); \
 } while (0)
#elif defined(ALEWIFE)
#include <parallel.h>
#include <assert.h>
#else
#include <sys/time.h>
#include <assert.h>
#endif

#if defined(USE_CRL)
#include "crl.h"
#endif

#if defined(LU_SANITY)
#define sanity(x) assert(x)
#else
#define sanity(x)
#endif

void     command_line(int, char **);
void     lu_worker(int, int, int);
#if defined(USE_CRL)
void     lu_matrix(rid_t);
#else
void     lu_matrix(double **);
#endif
void     update_row_block(double *, double *);
void     update_col_block(double *, double *);
void     update_int_block(double *, double *, double *);
void     lu_block(double *);
#if defined(USE_CRL)
rid_t    compute_lu_product(rid_t);
#else
double **compute_lu_product(double **);
#endif
void     lu_prod_both_complete(double *, double *, double *);
void     lu_prod_complete_upper(double *, double *, double *);
void     lu_prod_lower_complete(double *, double *, double *);
void     lu_prod_lower_upper(double *, double *, double *);
#if defined(USE_CRL)
rid_t    zero_matrix(void);
rid_t    gen_matrix(void);
rid_t    copy_matrix(rid_t);
rid_t    alloc_matrix(void);
void     compare_matrices(rid_t, rid_t);
void     compare_blocks(rid_t, rid_t, double *, double *);
#else
double **zero_matrix(void);
double **gen_matrix(void);
double **copy_matrix(double **);
double **alloc_matrix(void);
void     compare_matrices(double **, double **);
void     compare_blocks(double *, double *, double *, double *);
#endif
unsigned fast_random(unsigned);
void     timer_clear_and_start(void);
void     timer_stop_and_print(void);

#if !defined(USE_CRL)
extern void   shm_sync_seq_init(void);
extern void   shm_sync_par_init(void);
extern void   shm_barrier(void);
extern void   shm_bcast_send_p(void *);
extern void  *shm_bcast_recv_p(void);
extern double shm_reduce_dmin(double);
extern double shm_reduce_dmax(double);
extern double shm_reduce_dadd(double);
#endif

/* run parameters, filled in from the command line by command_line()
 * (_matsiz and _blksiz are stored again by lu_worker() from its
 * arguments)
 */
int _nprocs;		/* <nprocs> argument */
int _matsiz;		/* <matsiz> argument: matrix is _matsiz x _matsiz elements */
int _blksiz;		/* <blksiz> argument: blocks are _blksiz x _blksiz elements */

/* derived sizes, computed in lu_worker()
 */
int _eltspermat;	/* _matsiz * _matsiz */
int _matsizblks;	/* _matsiz / _blksiz */
int _blkspermat;	/* _matsizblks * _matsizblks */
int _eltsperblk;	/* _blksiz * _blksiz */

int _blkblkwdth;	/* W and H of block-of-blocks */
int _blkblkhght;
int _blkblkrow;		/* my position in block-of-blocks */
int _blkblkcol;


/* program entry point (named main2 when built for TCPUNIX)
 *
 * parses the command line into _nprocs/_matsiz/_blksiz and then runs
 * lu_worker(), either directly or via do_in_parallel() on alewife.
 * always returns 0.
 */
#if defined(TCPUNIX)
extern char *GROUP;
int main2(int argc, char **argv)
#else
int main(int argc, char **argv)
#endif
{
#if defined(CM5)
  /* put stdin, stdout, and stderr (fds 0-2) in CMMD_independent mode
   */
  CMMD_set_io_mode(0, CMMD_independent);
  CMMD_set_io_mode(1, CMMD_independent);
  CMMD_set_io_mode(2, CMMD_independent);
#endif

#if !defined(USE_CRL)
  /* sequential half of the shared-memory synchronization setup; the
   * parallel half (shm_sync_par_init) runs inside lu_worker()
   */
  shm_sync_seq_init();
#endif

  command_line(argc, argv);

#if defined(ALEWIFE)
  do_in_parallel(lu_worker, _nprocs, _matsiz, _blksiz);
#else
  lu_worker(_nprocs, _matsiz, _blksiz);
#endif

  return 0;
}


/* parse and validate the command line: <nprocs> <matsiz> <blksiz>
 *
 * on success the three values are stored in _nprocs, _matsiz, and
 * _blksiz. if the argument count is wrong, or any argument does not
 * parse as a decimal integer, a usage message is printed (on the cm5
 * only by node 0) and the program exits with status 1.
 *
 * range checks (positive sizes, block size dividing matrix size, and
 * on the cm5 a processor count that fits the partition) are enforced
 * with assert().
 */
void command_line(int argc, char **argv)
{
  int args_ok;

  /* && short-circuits, so argv[1..3] are only touched when argc says
   * they exist; each sscanf must convert exactly one integer
   */
  args_ok = ((argc == 4) &&
             (sscanf(argv[1], "%d", &_nprocs) == 1) &&
             (sscanf(argv[2], "%d", &_matsiz) == 1) &&
             (sscanf(argv[3], "%d", &_blksiz) == 1));

  if (!args_ok)
  {
#if defined(CM5)
    if (CMMD_self_address() == 0)
      fprintf(stderr, "usage: %s <nprocs> <matsiz> <blksiz>\n", argv[0]);
#else
    fprintf(stderr, "usage: %s <nprocs> <matsiz> <blksiz>\n", argv[0]);
#endif
    exit(1);
  }

#if defined(CM5)
  assert(_nprocs > 0);
  assert(_nprocs <= CMMD_partition_size());
  CMMD_reset_partition_size(_nprocs);
#endif

  assert(_matsiz > 0);
  assert(_blksiz > 0);
  assert((_matsiz % _blksiz) == 0);
}


/* per-processor body of the benchmark
 *
 * nprocs_arg  - processor count from the command line; under crl it
 *               is only checked against crl_num_nodes
 * matsiz_arg  - matrix dimension, stored into _matsiz
 * blksiz_arg  - block dimension, stored into _blksiz
 *
 * initializes the runtime (crl or shared-memory sync), computes the
 * derived size globals and this processor's position in the
 * block-of-blocks, generates a matrix, and times lu_matrix() on it.
 * with CHECK_RESULT defined, the product of the computed factors is
 * compared against a copy of the original matrix.
 */
void lu_worker(int nprocs_arg, int matsiz_arg, int blksiz_arg)
{
  int      self;
  int      nprocs;
#if defined(USE_CRL)
  rid_t    a;
#else
  double **a;
#endif
#if defined(CHECK_RESULT)
#if defined(USE_CRL)
  rid_t    b;
  rid_t    c;
#else
  double **b;
  double **c;
#endif
#endif

#if !defined(USE_CRL)
  shm_sync_par_init();
#endif

  _matsiz = matsiz_arg;
  _blksiz = blksiz_arg;

#if defined(USE_CRL)
#if defined(TCPUNIX)
  crl_init(GROUP);
#else
  crl_init();
#endif
  assert(nprocs_arg == crl_num_nodes);

  self   = crl_self_addr;
  nprocs = crl_num_nodes;
#else
  /* the shared-memory build takes its processor count from NPROCS
   */
  (void) nprocs_arg;

  self   = MY_PID;
  nprocs = NPROCS;
#endif

  _eltspermat = _matsiz * _matsiz;
  _matsizblks = _matsiz / _blksiz;
  _blkspermat = _matsizblks * _matsizblks;
  _eltsperblk = _blksiz * _blksiz;

  /* arrange the processors as a (height x width) block-of-blocks:
   * start with a single column and keep doubling the width / halving
   * the height while the width is less than half the height. the
   * assert rejects processor counts for which halving loses a
   * remainder.
   */
  _blkblkwdth = 1;
  _blkblkhght = nprocs;
  while ((_blkblkwdth * 2) < _blkblkhght)
  {
    _blkblkwdth *= 2;
    _blkblkhght /= 2;
  }
  assert((_blkblkwdth * _blkblkhght) == nprocs);

  _blkblkrow = self / _blkblkwdth;
  _blkblkcol = self % _blkblkwdth;

  if (self == 0)
  {
    printf("lu, matsiz = %d, blksiz = %d\n", _matsiz, _blksiz);
    fflush(stdout);
  }

  a = gen_matrix();
#if defined(CHECK_RESULT)
  /* keep an untouched copy; lu_matrix() factors a in place
   */
  b = copy_matrix(a);
#endif

#if defined(USE_CRL) && defined(STATISTICS)
  if (crl_self_addr == 0)
    printf("\n[reseting statistics]\n\n");
  crl_stats_reset();
#endif

  timer_clear_and_start();
  lu_matrix(a);
  timer_stop_and_print();

#if defined(USE_CRL) && defined(STATISTICS)
  if (crl_self_addr == 0)
    printf("\n");
  crl_stats_print();
#endif

#if defined(CHECK_RESULT)
  c = compute_lu_product(a);
  compare_matrices(b, c);
#endif
}


/* blocked LU factorization of a matrix, in place, without pivoting
 *
 * the matrix is a table of _matsizblks x _matsizblks blocks (region
 * ids under crl, block pointers otherwise). the sanity checks below
 * state the ownership rule: block (r, c) belongs to the processor
 * whose block-of-blocks position is (r % blkblkhght, c % blkblkwdth).
 *
 * for each diagonal block k, every processor goes through the same
 * three barrier-separated phases:
 *  1. the owner of the pivot block factors it with lu_block()
 *  2. owners of blocks in pivot row / column k update them against
 *     the factored pivot block
 *  3. every processor updates the interior blocks it owns (rows from
 *     row_start, columns from col_start) against the corresponding
 *     pivot column and pivot row blocks
 *
 * bbr / bbc track the block-of-blocks position that owns the current
 * pivot block (k modulo blkblkhght / blkblkwdth).
 */
#if defined(USE_CRL)
void lu_matrix(rid_t a_rid)
#else
void lu_matrix(double **a_dat)
#endif
{
  int     i, j, k;
  int     bbr, bbc;
  int     matsizblks;
  int     blkblkhght;
  int     blkblkwdth;
  int     blkblkrow;
  int     blkblkcol;
  int     row_start;
  int     col_start;
#if defined(USE_CRL)
  rid_t  *a_dat;
#endif
  double *pblk_dat;
  double *row_blk_dat;
  double *col_blk_dat;
  double *int_blk_dat;

  matsizblks = _matsizblks;
  blkblkhght = _blkblkhght;
  blkblkwdth = _blkblkwdth;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)
  /* the table of block region ids is held for reading throughout
   */
  a_dat = rgn_map(a_rid);
  rgn_start_read(a_dat);
#endif

  bbr = 0;
  bbc = 0;

  for (k=0; k<matsizblks; k++)
  {
#if defined(USE_CRL)
    pblk_dat = rgn_map(a_dat[(k*matsizblks)+k]);
#else
    pblk_dat = a_dat[(k*matsizblks)+k];
#endif

    sanity((k % blkblkwdth) == bbc);
    sanity((k % blkblkhght) == bbr);

    if (bbr == blkblkrow)
    {
      if (bbc == blkblkcol)
      {
	/* i own the pivot block
	 */
	sanity((k % blkblkhght) == blkblkrow);
	sanity((k % blkblkwdth) == blkblkcol);

#if defined(USE_CRL)
	rgn_start_write(pblk_dat);
#endif
	lu_block(pblk_dat);
#if defined(USE_CRL)
	rgn_end_write(pblk_dat);
#endif

	row_start = k + blkblkhght;
	col_start = k + blkblkwdth;

#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif

#if defined(USE_CRL)
	rgn_start_read(pblk_dat);
#endif

	/* update blocks in pivot row
	 */
	for (i=col_start; i<matsizblks; i+=blkblkwdth)
	{
	  sanity((k % blkblkhght) == blkblkrow);
	  sanity((i % blkblkwdth) == blkblkcol);

#if defined(USE_CRL)
	  row_blk_dat = rgn_map(a_dat[(k*matsizblks)+i]);
	  rgn_start_write(row_blk_dat);
#else
	  row_blk_dat = a_dat[(k*matsizblks)+i];
#endif

	  update_row_block(row_blk_dat, pblk_dat);

#if defined(USE_CRL)
	  rgn_end_write(row_blk_dat);
	  rgn_unmap(row_blk_dat);
#endif
	}

	/* update blocks in pivot column
	 */
	for (i=row_start; i<matsizblks; i+=blkblkhght)
	{
	  sanity((i % blkblkhght) == blkblkrow);
	  sanity((k % blkblkwdth) == blkblkcol);

#if defined(USE_CRL)
	  col_blk_dat = rgn_map(a_dat[(i*matsizblks)+k]);
	  rgn_start_write(col_blk_dat);
#else
	  col_blk_dat = a_dat[(i*matsizblks)+k];
#endif

	  update_col_block(col_blk_dat, pblk_dat);

#if defined(USE_CRL)
	  rgn_end_write(col_blk_dat);
	  rgn_unmap(col_blk_dat);
#endif
	}

#if defined(USE_CRL)
	rgn_end_read(pblk_dat);
	rgn_unmap(pblk_dat);
#endif

#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif
      }
      else
      {
	/* i own blocks in the pivot row,
	 * but not the pivot block itself
	 */
	row_start = k + blkblkhght;
	col_start = k + blkblkcol - bbc;
	if (blkblkcol < bbc)
	  col_start += blkblkwdth;

	/* wait until pivot block is factored
	 */
#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif

#if defined(USE_CRL)
	rgn_start_read(pblk_dat);
#endif

	/* update blocks in pivot row
	 */
	for (i=col_start; i<matsizblks; i+=blkblkwdth)
	{
	  sanity((k % blkblkhght) == blkblkrow);
	  sanity((i % blkblkwdth) == blkblkcol);

#if defined(USE_CRL)
	  row_blk_dat = rgn_map(a_dat[(k*matsizblks)+i]);
	  rgn_start_write(row_blk_dat);
#else
	  row_blk_dat = a_dat[(k*matsizblks)+i];
#endif

	  update_row_block(row_blk_dat, pblk_dat);

#if defined(USE_CRL)
	  rgn_end_write(row_blk_dat);
	  rgn_unmap(row_blk_dat);
#endif
	}

#if defined(USE_CRL)
	rgn_end_read(pblk_dat);
	rgn_unmap(pblk_dat);
#endif

#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif
      }
    }
    else
    {
      if (bbc == blkblkcol)
      {
	/* i own blocks in the pivot col,
	 * but not the pivot block itself
	 */
	row_start = k + blkblkrow - bbr;
	if (blkblkrow < bbr)
	  row_start += blkblkhght;
	col_start = k + blkblkwdth;

	/* wait until pivot block is factored
	 */
#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif

#if defined(USE_CRL)
	rgn_start_read(pblk_dat);
#endif

	/* update blocks in pivot column
	 */
	for (i=row_start; i<matsizblks; i+=blkblkhght)
	{
	  sanity((i % blkblkhght) == blkblkrow);
	  sanity((k % blkblkwdth) == blkblkcol);

#if defined(USE_CRL)
	  col_blk_dat = rgn_map(a_dat[(i*matsizblks)+k]);
	  rgn_start_write(col_blk_dat);
#else
	  col_blk_dat = a_dat[(i*matsizblks)+k];
#endif

	  update_col_block(col_blk_dat, pblk_dat);

#if defined(USE_CRL)
	  rgn_end_write(col_blk_dat);
	  rgn_unmap(col_blk_dat);
#endif
	}

#if defined(USE_CRL)
	rgn_end_read(pblk_dat);
	rgn_unmap(pblk_dat);
#endif

#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif
      }
      else
      {
	/* i don't own any blocks
	 * in the pivot row or col
	 */
	row_start = k + blkblkrow - bbr;
	if (blkblkrow < bbr)
	  row_start += blkblkhght;
	col_start = k + blkblkcol - bbc;
	if (blkblkcol < bbc)
	  col_start += blkblkwdth;

	/* wait until pivot block is factored
	 */
#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif

#if defined(USE_CRL)
	rgn_unmap(pblk_dat);
#endif

#if defined(USE_CRL)
	rgn_barrier();
#else
	shm_barrier();
#endif
      }
    }

    /* update the interior blocks i own: every block (i, j) with
     * i >= row_start and j >= col_start at my stride, using pivot
     * column block (i, k) and pivot row block (k, j)
     */
    for (i=row_start; i<matsizblks; i+=blkblkhght)
    {
#if defined(USE_CRL)
      col_blk_dat = rgn_map(a_dat[(i*matsizblks)+k]);
      rgn_start_read(col_blk_dat);
#else
      col_blk_dat = a_dat[(i*matsizblks)+k];
#endif

      for (j=col_start; j<matsizblks; j+=blkblkwdth)
      {
#if defined(USE_CRL)
	int_blk_dat = rgn_map(a_dat[(i*matsizblks)+j]);
	rgn_start_write(int_blk_dat);
#else
	int_blk_dat = a_dat[(i*matsizblks)+j];
#endif

#if defined(USE_CRL)
	row_blk_dat = rgn_map(a_dat[(k*matsizblks)+j]);
	rgn_start_read(row_blk_dat);
#else
	row_blk_dat = a_dat[(k*matsizblks)+j];
#endif

	update_int_block(int_blk_dat, col_blk_dat, row_blk_dat);

#if defined(USE_CRL)
	rgn_end_read(row_blk_dat);
	rgn_unmap(row_blk_dat);
	rgn_end_write(int_blk_dat);
	rgn_unmap(int_blk_dat);
#endif	
      }

#if defined(USE_CRL)
      rgn_end_read(col_blk_dat);
      rgn_unmap(col_blk_dat);
#endif
    }

    /* all interior updates for step k finish before step k+1 starts
     */
#if defined(USE_CRL)
    rgn_barrier();
#else
    shm_barrier();
#endif

    /* advance the pivot owner's position, wrapping at the edges of
     * the block-of-blocks
     */
    bbr += 1;
    if (bbr == blkblkhght) bbr = 0;

    bbc += 1;
    if (bbc == blkblkwdth) bbc = 0;
  }

#if defined(USE_CRL)
  rgn_end_read(a_dat);
  rgn_unmap(a_dat);
#endif
}


/* update a block (dst) in the pivot row (pblk is the pivot block)
 */
void update_row_block(double *dst, double *pblk)
{
  int     k, m, n;
  int     blksiz;
  double *row_k;
  double *row_m;
  double  tmp;

  blksiz = _blksiz;

  for (k=0; k<blksiz; k++)
  {
    /* pivot is pblk[k][k] */

    row_k = &(dst[(k*blksiz)+0]);

    /* update interior elements in dst
     */
    for (m=(k+1); m<blksiz; m++)
    {
      row_m = &(dst[(m*blksiz)+0]);
      tmp   = pblk[(m*blksiz)+k];

      for (n=0; n<blksiz; n++)
	row_m[n] -= tmp * row_k[n];
    }
  }
}


/* update a block (dst) in the pivot column (pblk is the pivot block)
 */
void update_col_block(double *dst, double *pblk)
{
  int     k, m, n;
  int     blksiz;
  double  pivot;
  double *row_m;
  double *row_k;
  double  tmp;

  blksiz = _blksiz;

  for (k=0; k<blksiz; k++)
  {
    /* pivot is pblk[k][k] */

    /* update pivot column elements in dst
     */
    row_k = &(pblk[(k*blksiz)+0]);
    pivot = row_k[k];
    tmp   = 1 / pivot;
    for (m=0; m<blksiz; m++)
      dst[(m*blksiz)+k] *= tmp;

    /* update interior elements in dst
     */
    for (m=0; m<blksiz; m++)
    {
      row_m = &(dst[(m*blksiz)+0]);
      tmp   = row_m[k];

      for (n=(k+1); n<blksiz; n++)
	row_m[n] -= tmp * row_k[n];
    }
  }
}


/* update a block (dst) in the interior (cblk and rblk are the
 * corresponding blocks in the pivot column and row, respectively)
 *
 * [could make this code much faster by unrolling all three loops
 * twice instead of just the innermost loop]
 */
void update_int_block(double *dst, double *cblk, double *rblk)
{
  int     i, j, k;
  int     blksiz;
  double  val;
  double *crow_i;
  double *rcol_j;

  blksiz = _blksiz;

  if (blksiz & 0x01)
  {
    for (i=0; i<blksiz; i++)
    {
      crow_i = &(cblk[(i*blksiz)+0]);

      for (j=0; j<blksiz; j++)
      {
	val    = dst[(i*blksiz)+j];
	rcol_j = &(rblk[(0*blksiz)+j]);

	val -= crow_i[0] * rcol_j[0];
	rcol_j += blksiz;

	for (k=1; k<blksiz; k+=2)
	{
	  val -= crow_i[k+0] * rcol_j[0];
	  val -= crow_i[k+1] * rcol_j[blksiz];
	  rcol_j += 2*blksiz;
	}

	dst[(i*blksiz)+j] = val;
      }
    }
  }
  else
  {
    for (i=0; i<blksiz; i++)
    {
      crow_i = &(cblk[(i*blksiz)+0]);

      for (j=0; j<blksiz; j++)
      {
	val    = dst[(i*blksiz)+j];
	rcol_j = &(rblk[(0*blksiz)+j]);

	for (k=0; k<blksiz; k+=2)
	{
	  val -= crow_i[k+0] * rcol_j[0];
	  val -= crow_i[k+1] * rcol_j[blksiz];
	  rcol_j += 2*blksiz;
	}

	dst[(i*blksiz)+j] = val;
      }
    }
  }
}


void lu_block(double *a)
{
  int     k, m, n;
  int     blksiz;
  double  pivot;
  double *row_m;
  double *row_k;
  double  tmp;

  blksiz = _blksiz;

  for (k=0; k<blksiz; k++)
  {
    row_k = &(a[(k*blksiz)+0]);
    pivot = row_k[k];

    tmp = 1 / pivot;
    m   = k + 1;

    if ((blksiz - m) & 0x01)
    {
      a[(m*blksiz)+k] *= tmp;
      m += 1;
    }

    for (; m<blksiz; m+=2)
    {
      a[((m+0)*blksiz)+k] *= tmp;
      a[((m+1)*blksiz)+k] *= tmp;
    }

    for (m=(k+1); m<blksiz; m++)
    {
      row_m = &(a[(m*blksiz)+0]);

      tmp = row_m[k];
      n   = k + 1;

      if ((blksiz - n) & 0x01)
      {
	row_m[n] -= tmp * row_k[n];
	n += 1;
      }

      for (; n<blksiz; n+=2)
      {
	row_m[n+0] -= tmp * row_k[n+0];
	row_m[n+1] -= tmp * row_k[n+1];
      }
    }
  }
}


/* multiply the L and U factors stored in a factored matrix
 *
 * takes the matrix produced by lu_matrix() (L below the diagonal with
 * an implied unit diagonal, U on and above it) and returns a newly
 * allocated matrix holding L * U. each processor computes only the
 * result blocks (i, j) with i at its row stride and j at its column
 * stride.
 *
 * for result block (i, j) only k <= min(i, j) can contribute, since
 * L_a[i][k] is zero for k > i and U_a[k][j] is zero for k > j. the
 * k loop stops there, so zero blocks are never mapped or read.
 */
#if defined(USE_CRL)
rid_t compute_lu_product(rid_t a)
#else
double **compute_lu_product(double **a_dat)
#endif
{
  int      i, j, k;
  int      k_last;
  int      matsizblks;
  int      blkblkwdth;
  int      blkblkhght;
  int      blkblkrow;
  int      blkblkcol;
#if defined(USE_CRL)
  rid_t   *a_dat;
#endif
  double  *dst_dat;
  double  *l_dat;
  double  *u_dat;
#if defined(USE_CRL)
  rid_t    rslt_rid;
  rid_t   *rslt_dat;
#else
  double **rslt_dat;
#endif

  matsizblks = _matsizblks;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)
  a_dat = rgn_map(a);
  rgn_start_read(a_dat);
#endif

  /* the result starts out as all zeros and is accumulated into
   */
#if defined(USE_CRL)
  rslt_rid = zero_matrix();
  rslt_dat = rgn_map(rslt_rid);
  rgn_start_read(rslt_dat);
#else
  rslt_dat = zero_matrix();
#endif

  for (i=blkblkrow; i<matsizblks; i+=blkblkhght)
    for (j=blkblkcol; j<matsizblks; j+=blkblkwdth)
    {
#if defined(USE_CRL)
      dst_dat = rgn_map(rslt_dat[(i*matsizblks)+j]);
      rgn_start_write(dst_dat);
#else
      dst_dat = rslt_dat[(i*matsizblks)+j];
#endif

      k_last = (i < j) ? i : j;

      for (k=0; k<=k_last; k++)
      {
        /* want to compute
         *
         * rslt[i][j] += L_a[i][k] * U_a[k][j]
         *
         * with k <= i and k <= j guaranteed by the loop bound:
         *
         * L_a[i][k] is one of
         *  - lower triangular (i == k)
         *  - complete         (i > k)
         *
         * U_a[k][j] is one of
         *  - complete         (k < j)
         *  - upper triangular (k == j)
         */

#if defined(USE_CRL)
        l_dat = rgn_map(a_dat[(i*matsizblks)+k]);
        rgn_start_read(l_dat);
#else
        l_dat = a_dat[(i*matsizblks)+k];
#endif

#if defined(USE_CRL)
        u_dat = rgn_map(a_dat[(k*matsizblks)+j]);
        rgn_start_read(u_dat);
#else
        u_dat = a_dat[(k*matsizblks)+j];
#endif

        if (i > k)
        {
          if (k < j)
          {
            /* both L_a[i][k] and U_a[k][j] are complete
             */
            lu_prod_both_complete(dst_dat, l_dat, u_dat);
          }
          else
          {
            /* L_a[i][k] is complete, U_a[k][j] is upper triangular
             */
            lu_prod_complete_upper(dst_dat, l_dat, u_dat);
          }
        }
        else
        {
          if (k < j)
          {
            /* L_a[i][k] is lower triangular, U_a[k][j] is complete
             */
            lu_prod_lower_complete(dst_dat, l_dat, u_dat);
          }
          else
          {
            /* L_a[i][k] is lower triangular, U_a[k][j] is upper triangular
             */
            lu_prod_lower_upper(dst_dat, l_dat, u_dat);
          }
        }

#if defined(USE_CRL)
        rgn_end_read(u_dat);
        rgn_unmap(u_dat);
        rgn_end_read(l_dat);
        rgn_unmap(l_dat);
#endif
      }

#if defined(USE_CRL)
      rgn_end_write(dst_dat);
      rgn_unmap(dst_dat);
#endif
    }

#if defined(USE_CRL)
  rgn_end_read(rslt_dat);
  rgn_unmap(rslt_dat);
#endif

#if defined(USE_CRL)
  rgn_end_read(a_dat);
  rgn_unmap(a_dat);
#endif

#if defined(USE_CRL)
  return rslt_rid;
#else
  return rslt_dat;
#endif
}


void lu_prod_both_complete(double *dst, double *l, double *u)
{
  int    i, j, k;
  int    blksiz;
  double val;

  blksiz = _blksiz;

  for (i=0; i<blksiz; i++)
    for (j=0; j<blksiz; j++)
    {
      val = 0;
      for (k=0; k<blksiz; k++)
	val += l[(i*blksiz)+k] * u[(k*blksiz)+j];
      dst[(i*blksiz)+j] += val;
    }
}


void lu_prod_complete_upper(double *dst, double *l, double *u)
{
  int    i, j, k;
  int    blksiz;
  double val;

  blksiz = _blksiz;

  for (i=0; i<blksiz; i++)
    for (j=0; j<blksiz; j++)
    {
      val = 0;
      for (k=0; k<blksiz; k++)
      {
	if (k <= j)
	  val += l[(i*blksiz)+k] * u[(k*blksiz)+j];
      }
      dst[(i*blksiz)+j] += val;
    }
}


void lu_prod_lower_complete(double *dst, double *l, double *u)
{
  int    i, j, k;
  int    blksiz;
  double val;

  blksiz = _blksiz;

  for (i=0; i<blksiz; i++)
    for (j=0; j<blksiz; j++)
    {
      val = 0;
      for (k=0; k<blksiz; k++)
      {
	if (i > k)
	  val += l[(i*blksiz)+k] * u[(k*blksiz)+j];
	else if (i == k)
	  val += u[(k*blksiz)+j];
      }
      dst[(i*blksiz)+j] += val;
    }
}


void lu_prod_lower_upper(double *dst, double *l, double *u)
{
  int    i, j, k;
  int    blksiz;
  double val;

  blksiz = _blksiz;

  for (i=0; i<blksiz; i++)
    for (j=0; j<blksiz; j++)
    {
      val = 0;
      for (k=0; k<blksiz; k++)
      {
	if (k <= j)
	{
	  if (i > k)
	    val += l[(i*blksiz)+k] * u[(k*blksiz)+j];
	  else if (i == k)
	    val += u[(k*blksiz)+j];
	}
      }
      dst[(i*blksiz)+j] += val;
    }
}


#if defined(USE_CRL)
rid_t zero_matrix(void)
#else
double **zero_matrix(void)
#endif
{
  int      i, r, c;
  int      matsizblks;
  int      eltsperblk;
  int      blkblkwdth;
  int      blkblkhght;
  int      blkblkrow;
  int      blkblkcol;
#if defined(USE_CRL)
  rid_t    dst_mat_rid;
  rid_t   *dst_mat_dat;
#else
  double **dst_mat_dat;
#endif
  double  *dst_blk_dat;

  matsizblks = _matsizblks;
  eltsperblk = _eltsperblk;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)
  dst_mat_rid = alloc_matrix();
  dst_mat_dat = rgn_map(dst_mat_rid);
  rgn_start_read(dst_mat_dat);
#else
  dst_mat_dat = alloc_matrix();
#endif

  for (r=blkblkrow; r<matsizblks; r+=blkblkhght)
    for (c=blkblkcol; c<matsizblks; c+=blkblkwdth)
    {
#if defined(USE_CRL)
      dst_blk_dat = rgn_map(dst_mat_dat[(r*matsizblks)+c]);
      rgn_start_write(dst_blk_dat);
#else
      dst_blk_dat = dst_mat_dat[(r*matsizblks)+c];
#endif

      for (i=0; i<eltsperblk; i++)
	dst_blk_dat[i] = 0;

#if defined(USE_CRL)
      rgn_end_write(dst_blk_dat);
      rgn_unmap(dst_blk_dat);
#endif
    }      

#if defined(USE_CRL)
  rgn_end_read(dst_mat_dat);
  rgn_unmap(dst_mat_dat);
#endif

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

#if defined(USE_CRL)
  return dst_mat_rid;
#else
  return dst_mat_dat;
#endif
}


#if defined(USE_CRL)
rid_t gen_matrix(void)
#else
double **gen_matrix(void)
#endif
{
  int      i, r, c;
  int      matsizblks;
  int      eltsperblk;
  int      blkblkwdth;
  int      blkblkhght;
  int      blkblkrow;
  int      blkblkcol;
  unsigned state;
  double   val;
#if defined(USE_CRL)
  rid_t    dst_mat_rid;
  rid_t   *dst_mat_dat;
#else
  double **dst_mat_dat;
#endif
  double  *dst_blk_dat;

  matsizblks = _matsizblks;
  eltsperblk = _eltsperblk;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)
  state = fast_random(1009*crl_self_addr + 101);
#else
  state = fast_random(1009*MY_PID + 101);
#endif

#if defined(USE_CRL)
  dst_mat_rid = alloc_matrix();
  dst_mat_dat = rgn_map(dst_mat_rid);
  rgn_start_read(dst_mat_dat);
#else
  dst_mat_dat = alloc_matrix();
#endif

  for (r=blkblkrow; r<matsizblks; r+=blkblkhght)
    for (c=blkblkcol; c<matsizblks; c+=blkblkwdth)
    {
#if defined(USE_CRL)
      dst_blk_dat = rgn_map(dst_mat_dat[(r*matsizblks)+c]);
      rgn_start_write(dst_blk_dat);
#else
      dst_blk_dat = dst_mat_dat[(r*matsizblks)+c];
#endif

      for (i=0; i<eltsperblk; i++)
      {
	state = fast_random(state);
	val   = state / 2147483647.0;
	state = fast_random(state);
	val  += state / 2147483647.0;
	val  /= 2;
	dst_blk_dat[i] = val;
      }

#if defined(USE_CRL)
      rgn_end_write(dst_blk_dat);
      rgn_unmap(dst_blk_dat);
#endif
    }      

#if defined(USE_CRL)
  rgn_end_read(dst_mat_dat);
  rgn_unmap(dst_mat_dat);
#endif

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

#if defined(USE_CRL)
  return dst_mat_rid;
#else
  return dst_mat_dat;
#endif
}


#if defined(USE_CRL)
rid_t copy_matrix(rid_t src_mat_rid)
#else
double **copy_matrix(double **src_mat_dat)
#endif
{
  int     i, r, c;
  int      matsizblks;
  int      eltsperblk;
  int      blkblkwdth;
  int      blkblkhght;
  int      blkblkrow;
  int      blkblkcol;
#if defined(USE_CRL)
  rid_t   *src_mat_dat;
  rid_t    dst_mat_rid;
  rid_t   *dst_mat_dat;
#else
  double **dst_mat_dat;
#endif
  double *src_blk_dat;
  double *dst_blk_dat;

  matsizblks = _matsizblks;
  eltsperblk = _eltsperblk;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)
  src_mat_dat = rgn_map(src_mat_rid);
  rgn_start_read(src_mat_dat);
#endif

#if defined(USE_CRL)
  dst_mat_rid = alloc_matrix();
  dst_mat_dat = rgn_map(dst_mat_rid);
  rgn_start_read(dst_mat_dat);
#else
  dst_mat_dat = alloc_matrix();
#endif

  for (r=blkblkrow; r<matsizblks; r+=blkblkhght)
    for (c=blkblkcol; c<matsizblks; c+=blkblkwdth)
    {
#if defined(USE_CRL)
      src_blk_dat = rgn_map(src_mat_dat[(r*matsizblks)+c]);
      rgn_start_read(src_blk_dat);
#else
      src_blk_dat = src_mat_dat[(r*matsizblks)+c];
#endif

#if defined(USE_CRL)
      dst_blk_dat = rgn_map(dst_mat_dat[(r*matsizblks)+c]);
      rgn_start_write(dst_blk_dat);
#else
      dst_blk_dat = dst_mat_dat[(r*matsizblks)+c];
#endif

      for (i=0; i<eltsperblk; i++)
	dst_blk_dat[i] = src_blk_dat[i];

#if defined(USE_CRL)
      rgn_end_write(dst_blk_dat);
      rgn_unmap(dst_blk_dat);
      rgn_end_read(src_blk_dat);
      rgn_unmap(src_blk_dat);
#endif
    }      

#if defined(USE_CRL)
  rgn_end_read(dst_mat_dat);
  rgn_unmap(dst_mat_dat);
  rgn_end_read(src_mat_dat);
  rgn_unmap(src_mat_dat);
#endif

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

#if defined(USE_CRL)
  return dst_mat_rid;
#else
  return dst_mat_dat;
#endif
}


/* allocate storage for a blocked matrix; block contents are left
 * uninitialized
 *
 * crl build: node 0 creates a region holding one rid_t per block and
 * broadcasts its id. every node then creates the regions for the
 * blocks at its own row / column stride and stores their ids in the
 * table. returns the table's region id.
 *
 * shared-memory build: every processor mallocs its own table of block
 * pointers. for each block, the processor at that block's
 * block-of-blocks position mallocs it and broadcasts the pointer, and
 * all the others receive it. both mallocs are checked with assert().
 * returns the table.
 *
 * ends with a barrier in both builds.
 */
#if defined(USE_CRL)
rid_t alloc_matrix(void)
#else
double **alloc_matrix(void)
#endif
{
  int      r, c;
  int      blkspermat;
  int      matsizblks;
  int      eltsperblk;
  int      blkblkwdth;
  int      blkblkhght;
  int      blkblkrow;
  int      blkblkcol;
#if defined(USE_CRL)
  rid_t    rslt_rid;
  rid_t   *rslt_dat;
#else
  double  *new;
  double **rslt_dat;
#endif

  blkspermat = _blkspermat;
  matsizblks = _matsizblks;
  eltsperblk = _eltsperblk;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

#if defined(USE_CRL)

  if (crl_self_addr == 0)
  {
    rslt_rid = rgn_create(sizeof(rid_t) * blkspermat);
    rgn_bcast_send(sizeof(rid_t), &rslt_rid);
  }
  else
  {
    rgn_bcast_recv(sizeof(rid_t), &rslt_rid);
  }

  rslt_dat = rgn_map(rslt_rid);
  rgn_start_write(rslt_dat);

  /* each node fills in only the table entries for its own blocks
   */
  for (r=blkblkrow; r<matsizblks; r+=blkblkhght)
    for (c=blkblkcol; c<matsizblks; c+=blkblkwdth)
      rslt_dat[(r*matsizblks)+c] = rgn_create(sizeof(double) * eltsperblk);

  rgn_end_write(rslt_dat);
  rgn_unmap(rslt_dat);

#else

  rslt_dat = (double **) malloc(sizeof(double *) * blkspermat);
  assert(rslt_dat != NULL);

  /* every processor walks all blocks in the same order, so each
   * broadcast send is matched by a receive on all other processors
   */
  for (r=0; r<matsizblks; r++)
    for (c=0; c<matsizblks; c++)
    {
      if (((r % blkblkhght) == blkblkrow) &&
          ((c % blkblkwdth) == blkblkcol))
      {
        new = (double *) malloc(sizeof(double) * eltsperblk);
        assert(new != NULL);
        shm_bcast_send_p((void *) new);
      }
      else
      {
        new = shm_bcast_recv_p();
      }

      rslt_dat[(r*matsizblks)+c] = new;
    }

#endif

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

#if defined(USE_CRL)
  return rslt_rid;
#else
  return rslt_dat;
#endif
}


#if defined(USE_CRL)
void compare_matrices(rid_t a, rid_t b)
#else
void compare_matrices(double **a_mat_dat, double **b_mat_dat)
#endif
{
  int     r, c;
  int     matsizblks;
  int     eltspermat;
  int     blkblkwdth;
  int     blkblkhght;
  int     blkblkrow;
  int     blkblkcol;
  int     self;
#if defined(USE_CRL)
  rid_t  *a_mat_dat;
  rid_t  *b_mat_dat;
  rid_t   a_blk;
  rid_t   b_blk;
#else
  double *a_blk;
  double *b_blk;
#endif
  double max;
  double sum;

  matsizblks = _matsizblks;
  eltspermat = _eltspermat;
  blkblkwdth = _blkblkwdth;
  blkblkhght = _blkblkhght;
  blkblkrow  = _blkblkrow;
  blkblkcol  = _blkblkcol;

  max = 0;
  sum = 0;

#if defined(USE_CRL)
  a_mat_dat = rgn_map(a);
  rgn_start_read(a_mat_dat);
  b_mat_dat = rgn_map(b);
  rgn_start_read(b_mat_dat);
#endif

  for (r=blkblkrow; r<matsizblks; r+=blkblkhght)
    for (c=blkblkcol; c<matsizblks; c+=blkblkwdth)
    {
      a_blk = a_mat_dat[(r*matsizblks)+c];
      b_blk = b_mat_dat[(r*matsizblks)+c];
      compare_blocks(a_blk, b_blk, &max, &sum);
    }

#if defined(USE_CRL)
  rgn_end_read(b_mat_dat);
  rgn_unmap(b_mat_dat);
  rgn_end_read(a_mat_dat);
  rgn_unmap(a_mat_dat);
#endif

#if defined(USE_CRL)
  sum  = rgn_reduce_dadd(sum);
  max  = rgn_reduce_dmax(max);
  self = crl_self_addr;
#else
  sum  = shm_reduce_dadd(sum);
  max  = shm_reduce_dmax(max);
  self = MY_PID;
#endif

  if (self == 0)
  {
    sum /= eltspermat;
    printf("max error  %e\n", max);
    printf("mean error %e\n", sum);
  }
}


#if defined(USE_CRL)
void compare_blocks(rid_t a, rid_t b, double *max, double *sum)
#else
void compare_blocks(double *a_dat, double *b_dat, double *max, double *sum)
#endif
{
  int     i;
  int     eltsperblk;
#if defined(USE_CRL)
  double *a_dat;
  double *b_dat;
#endif
  double  val;
  double  _max;
  double  _sum;

  eltsperblk = _eltsperblk;

#if defined(USE_CRL)
  a_dat = rgn_map(a);
  rgn_start_read(a_dat);
  b_dat = rgn_map(b);
  rgn_start_read(b_dat);
#endif

  _max = 0;
  _sum = 0;

  for (i=0; i<eltsperblk; i++)
  {
    val = a_dat[i] - b_dat[i];
    if (val < 0) val = - val;

    if (val > _max) _max = val;
    _sum += val;
  }

  if (_max > *max) *max = _max;
  *sum += _sum;

#if defined(USE_CRL)
  rgn_end_read(b_dat);
  rgn_unmap(b_dat);
  rgn_end_read(a_dat);
  rgn_unmap(a_dat);
#endif
}


/* advance a pseudo-random generator by one step
 *
 * splits state into quotient and remainder by 127773, computes
 * 16807 * remainder - 2836 * quotient as a signed value, and adds
 * 2147483647 when that value is not positive. returns the new state.
 */
unsigned fast_random(unsigned state)
{
  const unsigned quot_div = 127773u;
  const unsigned mult     = 16807u;
  const unsigned rem_mult = 2836u;
  const unsigned modulus  = 2147483647u;
  int            next;

  /* the difference is formed in unsigned arithmetic and then
   * reinterpreted as signed so that a "negative" result can be
   * detected
   */
  next = (int) ((mult * (state % quot_div)) - (rem_mult * (state / quot_div)));

  if (next > 0)
    return (unsigned) next;

  return ((unsigned) next) + modulus;
}


/* start-of-interval timestamp shared by timer_clear_and_start() and
 * timer_stop_and_print(). every build except the cm5 one (which uses
 * the CMMD node timer instead) reads and writes it, so it is declared
 * for all non-cm5 builds.
 */
#if !defined(CM5)
static double _timer_start;
#endif

/* reset and start the interval timer
 *
 * cm5:     clears node timer 0, synchronizes the nodes, starts it
 * alewife: resets crl profiling (crl build only), spins on a barrier,
 *          then records get_time()
 * other:   records gettimeofday() as seconds
 */
void timer_clear_and_start(void)
{
#if defined(CM5)
  CMMD_node_timer_clear(0);
  CMMD_sync_with_nodes();
  CMMD_node_timer_start(0);
#elif defined(ALEWIFE)
#if defined(USE_CRL)
  crl_prof_reset();
#endif
  mp_spin_barrier();
  _timer_start = get_time();
#else
  struct timeval tp;
  gettimeofday(&tp, NULL);
  _timer_start = tp.tv_sec + (tp.tv_usec * 1e-6);
#endif
}

/* stop the interval timer and print timing statistics
 *
 * every processor contributes its elapsed time (seconds on the cm5
 * and in the gettimeofday build, get_time() units scaled by 1e-6 on
 * alewife) and its square to two add-reductions. processor 0 then
 * prints the mean and standard deviation, followed by the processor
 * count and the two raw sums.
 *
 * the variance is clamped at zero before the square root: when all
 * processors report nearly the same time, rounding can make
 * mean-of-squares minus square-of-mean slightly negative, and sqrt()
 * of that would be NaN.
 */
void timer_stop_and_print(void)
{
  int    self;
  int    nprocs;
  double t1;
  double t2;
  double avg;
  double var;
  double std;

#if defined(CM5)
  CMMD_node_timer_stop(0);
  t1 = CMMD_node_timer_elapsed(0); /* seconds */
#elif defined(ALEWIFE)
  t1 = get_time();
  t1 = (t1 - _timer_start) * 1e-6; /* Mcycles */
#else
  struct timeval tp;
  gettimeofday(&tp,NULL);
  t1  = tp.tv_sec + (tp.tv_usec * 1e-6);
  t1 -= _timer_start;		/* seconds */
#endif

#if defined(USE_CRL)
  self   = crl_self_addr;
  nprocs = crl_num_nodes;
#else
  self   = MY_PID;
  nprocs = NPROCS;
#endif

  t2 = t1 * t1;
#if defined(USE_CRL)
  t1 = rgn_reduce_dadd(t1);
  t2 = rgn_reduce_dadd(t2);
#else
  t1 = shm_reduce_dadd(t1);
  t2 = shm_reduce_dadd(t2);
#endif

  avg = t1 / nprocs;
  var = (t2 / nprocs) - (avg * avg);
  if (var < 0)
    var = 0;
  std = sqrt(var);

  if (self == 0)
  {
    printf(" %.3f +- %.3f  ", avg, std);
    /* %u expects an unsigned argument; nprocs is an int */
    printf(" %u %.6f %.6f\n", (unsigned) nprocs, t1, t2);
  }

#if defined(ALEWIFE) && defined(USE_CRL)
  crl_prof_print();
#endif
}


#if !defined(USE_CRL)

/* support for global synchronization under shared memory
 */

#define HackMaxProcs (128)

static double _shm_dmin(double, double);
static double _shm_dmax(double, double);

shared void      *shm_global_tmp[HackMaxProcs];
static red_tree_p shm_global_sync;

/* sequential half of the shared-memory synchronization setup
 * [must run on processor 0 only, which is asserted]
 *
 * builds the global reduction tree and stores it in every
 * processor's slot of shm_global_tmp, where shm_sync_par_init()
 * picks it up. the processor count must be below HackMaxProcs.
 */
void shm_sync_seq_init(void)
{
  int        slot;
  int        me;
  int        count;
  red_tree_p tree;

  me    = MY_PID;
  count = NPROCS;

  assert(me == 0);
  assert(count < HackMaxProcs);
  tree = make_global_reduction_tree();

  slot = 0;
  while (slot < count)
  {
    shm_global_tmp[slot] = (void *) tree;
    slot++;
  }
}


/* parallel half of the shared-memory synchronization setup
 * [runs on every processor]
 *
 * copies this processor's slot of shm_global_tmp (filled in by
 * shm_sync_seq_init) into shm_global_sync and asserts that it is not
 * NULL.
 */
void shm_sync_par_init(void)
{
  int me;

  me = MY_PID;

  shm_global_sync = (red_tree_p) shm_global_tmp[me];
  assert(shm_global_sync != NULL);
}


/* barrier for the shared-memory build: runs SM_TREE_BARRIER on the
 * global sync tree (shm_global_sync) for this processor
 */
void shm_barrier(void)
{
  SM_TREE_BARRIER(MY_PID, shm_global_sync);
}


/* sending side of a pointer broadcast
 *
 * writes val into every processor's slot of shm_global_tmp, then
 * passes two barriers. these pair with the two barriers in
 * shm_bcast_recv_p(), which reads its slot between them.
 */
void shm_bcast_send_p(void *val)
{
  int slot;
  int count;

  count = NPROCS;

  slot = 0;
  while (slot < count)
  {
    shm_global_tmp[slot] = val;
    slot++;
  }

  /* first barrier: the slots are filled */
  SM_TREE_BARRIER(MY_PID, shm_global_sync);
  /* second barrier: pairs with the one receivers pass after reading */
  SM_TREE_BARRIER(MY_PID, shm_global_sync);
}


/* receiving side of a pointer broadcast
 *
 * passes a barrier, reads this processor's slot of shm_global_tmp,
 * passes a second barrier, and returns the value read. pairs with
 * the two barriers in shm_bcast_send_p().
 */
void *shm_bcast_recv_p(void)
{
  int   me;
  void *received;

  me = MY_PID;

  SM_TREE_BARRIER(me, shm_global_sync);
  received = shm_global_tmp[me];
  SM_TREE_BARRIER(me, shm_global_sync);

  return received;
}


/* global reduction of val over the sync tree using _shm_dmin (the
 * smaller of two doubles) as the combining function; returns the
 * value produced by REDUCE
 */
double shm_reduce_dmin(double val)
{
  double rslt;

  rslt = REDUCE(MY_PID, shm_global_sync, _shm_dmin, val);

  return rslt;
}


/* global reduction of val over the sync tree using _shm_dmax (the
 * larger of two doubles) as the combining function; returns the
 * value produced by REDUCE
 */
double shm_reduce_dmax(double val)
{
  double rslt;

  rslt = REDUCE(MY_PID, shm_global_sync, _shm_dmax, val);

  return rslt;
}


/* global add-reduction of val over the sync tree; returns the value
 * produced by REDUCE_ADD
 */
double shm_reduce_dadd(double val)
{
  double rslt;

  rslt = REDUCE_ADD(MY_PID, shm_global_sync, val);

  return rslt;
}


/* combining function for shm_reduce_dmin: returns a when a < b,
 * otherwise b
 */
static double _shm_dmin(double a, double b)
{
  if (a < b)
    return a;

  return b;
}


/* combining function for shm_reduce_dmax: returns a when a > b,
 * otherwise b
 */
static double _shm_dmax(double a, double b)
{
  if (a > b)
    return a;

  return b;
}

#endif
