/*************************************************************************/
/*                                                                       */
/*  Copyright (c) 1994 Stanford University                               */
/*                                                                       */
/*  All rights reserved.                                                 */
/*                                                                       */
/*  Permission is given to use, copy, and modify this software for any   */
/*  non-commercial purpose as long as this copyright notice is not       */
/*  removed.  All other uses, including redistribution in whole or in    */
/*  part, are forbidden without prior written permission.                */
/*                                                                       */
/*  This software is provided with absolutely no warranty and no         */
/*  support.                                                             */
/*                                                                       */
/* --------------------------------------------------------------------- */
/*                                                                       */
/*  Modifications of the original Barnes-Hut code (as taken from         */
/*  Stanford's SPLASH-2 distribution) to allow use on Alewife and        */
/*  with CRL are copyright:                                              */
/*                                                                       */
/*  Copyright (C) 1995 Massachusetts Institute of Technology             */
/*                                                                       */
/*************************************************************************/

#include "code.h"
#include <math.h>

/*#define STATISTICS*/

/* random-number source declared here and defined outside this file;
 * testdata() and pickshell() call it with a (low, high) pair of bounds
 */
extern double xrand(double, double);

/* forward declarations for the routines defined later in this file */
void barnes_worker(int, int);
void command_line(int, char **);
void set_params(void); 
void global_init(void);
void local_init(void);
void alloc_bodytab(void);
void tab_init(void);
void SlaveStart(void);
void stepsystem(unsigned int);
void init_root(unsigned);
void ComputeForces(unsigned int);
/* the body table holds region ids under CRL and plain pointers otherwise,
 * so the prototype differs between the two builds
 */
#if defined(USE_CRL)
void find_my_initial_bodies(rid_t *, int, unsigned);
#else
void find_my_initial_bodies(bodyptr *, int, unsigned);
#endif
void find_my_bodies(nodeptr, int, int, unsigned);
void Housekeep(unsigned);
void testdata(void);
void pickshell(real [], real);
void setbound(void);

int desired_nproc;		/* processor count requested on the command line */

real dtime;			/* timestep for leapfrog integrator */
real dtout;			/* time between data outputs */
real tstop;			/* time to stop calculation */
int  nbody;			/* number of bodies in system */
real fcells;			/* ratio of cells/leaves allocated */
real fleaves;			/* ratio of leaves/bodies allocated */
real tol;			/* accuracy parameter: 0.0 => exact */
real tolsq;			/* square of previous */
real eps;			/* potential softening parameter */
real epssq;			/* square of previous */
real dthf;			/* half time step */

int      maxcell;		/* max number of cells allocated */
int      maxleaf;		/* max number of leaves allocated */
int      maxmybody;		/* max no. of bodies allocated per processor */
int      maxmycell;		/* max num. of cells to be allocated */
int      maxmyleaf;		/* max num. of leaves to be allocated */
#if defined(USE_CRL)
rid_t   *bodytab;		/* array size is exactly nbody bodies */
#else
bodyptr *bodytab;		/* nbody body pointers; filled in by alloc_bodytab() */
#endif
cellptr  g_root;		/* global tree root */
vector   rmin;			/* lower-left corner of coordinate box */
real     rsize;			/* side-length of integer coordinate box */

/* shared state: created on processor 0 and handed to the other
 * processors by global_init()
 */
struct GlobalMemory *Global;
/* per-processor state (step counter, current time, private tables) */
struct LocalMemory   Local;


/* Program entry point.
 *
 * Under TCPUNIX the function is compiled as main2() and GROUP is supplied
 * from elsewhere (it is later passed to crl_init() in barnes_worker()).
 * Parses the command line and then runs barnes_worker(), either through
 * do_in_parallel() (ALEWIFE) or by calling it directly.  Always returns 0.
 */
#if defined(TCPUNIX)
extern char *GROUP;
int main2(int argc, char **argv)
#else
int main(int argc, char **argv)
#endif
{
#if defined(CM5)
  /* select independent I/O mode for descriptors 0, 1 and 2 */
  CMMD_set_io_mode(0, CMMD_independent);
  CMMD_set_io_mode(1, CMMD_independent);
  CMMD_set_io_mode(2, CMMD_independent);
#endif

#if !defined(USE_CRL)
  /* sequential half of the shared-memory synchronization setup; the
   * parallel half (shm_sync_par_init) runs inside barnes_worker()
   */
  shm_sync_seq_init();
#endif

  command_line(argc, argv);

#if defined(ALEWIFE)
  do_in_parallel(barnes_worker, desired_nproc, nbody);
#else 
  barnes_worker(desired_nproc, nbody);
#endif

  return 0;
}


/* Per-processor driver for the whole run.
 *
 * nproc_arg - processor count; stored in desired_nproc and asserted to
 *             match the count reported by the runtime
 * nbody_arg - number of bodies; stored in nbody
 *
 * Sets parameters, initializes the runtime (CRL or shared memory),
 * allocates the shared and private tables, lets processor 0 generate the
 * initial conditions, and then runs SlaveStart() between two barriers.
 */
void barnes_worker(int nproc_arg, int nbody_arg)
{
  unsigned ProcessId;

  /* copy the arguments into the globals (main() passes the same globals
   * in; NOTE(review): presumably this matters when do_in_parallel() starts
   * this function on other processors - confirm)
   */
  desired_nproc = nproc_arg;
  nbody         = nbody_arg;

#if defined(CM5)
  CMMD_reset_partition_size(desired_nproc);
#endif

#if !defined(USE_CRL)
  shm_sync_par_init();
#endif

  set_params();

#if defined(USE_CRL)
#if defined(TCPUNIX)
  crl_init(GROUP);
#else
  crl_init();
#endif
  assert(desired_nproc == crl_num_nodes);
  ProcessId = crl_self_addr;
#else
  assert(desired_nproc == NPROCS);
  ProcessId = MY_PID;
#endif

  global_init();
  local_init();
  alloc_bodytab();

  /* only processor 0 generates the initial conditions and sets up output */
  if (ProcessId == 0)
  {
    testdata();
    initoutput();
  }

  tab_init();

  /* all processors wait here before the time-stepping starts */
#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

  /* NOTE: set Global->computestart here [only processor 0?]
   *
   * CLOCK(Global->computestart);
   * printf("COMPUTESTART  = %12u\n",Global->computestart);
   */

  SlaveStart();

  /* NOTE: set Global->computeend here [only processor 0?]
   *
   * CLOCK(Global->computeend);
   */

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

  /* NOTE: print final timing info here [only processor 0?]
   *
   * printf("COMPUTEEND    = %12u\n",Global->computeend);
   * printf("COMPUTETIME   = %12u\n",Global->computeend - Global->computestart);
   * printf("TRACKTIME     = %12u\n",Global->tracktime);
   * printf("PARTITIONTIME = %12u\t%5.2f\n",Global->partitiontime,
   *        ((float)Global->partitiontime)/Global->tracktime);
   * printf("TREEBUILDTIME = %12u\t%5.2f\n",Global->treebuildtime,
   *        ((float)Global->treebuildtime)/Global->tracktime);
   * printf("FORCECALCTIME = %12u\t%5.2f\n",Global->forcecalctime,
   *        ((float)Global->forcecalctime)/Global->tracktime);
   * printf("RESTTIME      = %12u\t%5.2f\n",
   *        Global->tracktime - Global->partitiontime -
   *        Global->treebuildtime - Global->forcecalctime,
   *        ((float)(Global->tracktime-Global->partitiontime-
   *        Global->treebuildtime-Global->forcecalctime))/
   *        Global->tracktime);
   */

#if defined(USE_CRL) && defined(STATISTICS)
  if (crl_self_addr == 0)
    printf("\n");
  crl_stats_print();
#endif
}


/* extract args from command line
 *
 * Expects exactly two arguments, <nproc> and <nbody>, and stores them in
 * the globals desired_nproc and nbody.  If the argument count is wrong or
 * either argument does not start with a decimal integer, a usage message
 * is printed to stderr (on CM5, by node 0 only) and the program exits
 * with status 1.
 */

void command_line(int argc, char **argv)
{
  int parsed;

  /* sscanf returns the number of conversions performed; anything other
   * than 1 means the argument was not a number and the target variable
   * was left untouched
   */
  parsed = (argc == 3)
    && (sscanf(argv[1], "%d", &desired_nproc) == 1)
    && (sscanf(argv[2], "%d", &nbody) == 1);

  if (!parsed)
  {
#if defined(CM5)
    if (CMMD_self_address() == 0)
      fprintf(stderr, "usage: %s <nproc> <nbody>\n", argv[0]);
#else
    fprintf(stderr, "usage: %s <nproc> <nbody>\n", argv[0]);
#endif
    exit(1);
  }

#if defined(CM5)
  assert(desired_nproc > 0);
  assert(desired_nproc <= CMMD_partition_size());
#endif

  assert(nbody > 0);
}


/* set default parameter values
 *
 * Fills in the simulation parameters (dtime, eps, tol, fcells, fleaves,
 * tstop, dtout), scales dtime and tol by (nbody / 16384)^(-1/4), derives
 * dthf, epssq and tolsq from the final values, and seeds the random
 * number generator through pranset().  Reads the global nbody, which must
 * already be set.
 */
void set_params(void)
{
  int    seed;
  double scale;

  /* set default parameter values
   */
  seed    = 123;
  dtime   = 0.025;
  eps     = 0.05;
  tol     = 1.0;
  fcells  = 2.0;
  fleaves = 1.0;
  tstop   = 0.075;
  dtout   = 0.25;

  /* scale dtime and tol according to nbody
   * (per scaling rules in splash TR)
   */
  scale  = pow(((double) nbody / 16384.0), -0.25);
  dtime *= scale;
  tol   *= scale;

  /* compute derived quantities
   *
   * This has to come after the scaling above: dthf is documented as half
   * the time step and tolsq as the square of tol, so both must be
   * derived from the scaled values.
   */
  dthf  = 0.5 * dtime;
  epssq = eps * eps;
  tolsq = tol * tol;

  pranset(seed);
}


/* initialize global data
 *
 * Processor 0 creates the GlobalMemory block (a CRL region, or a malloc'd
 * block in the shared-memory build), zeroes it, and broadcasts its handle;
 * every other processor receives the handle and points Global at it.
 * Must be called by all processors, since the broadcast is collective.
 */

void global_init(void)
{
  unsigned ProcessId;
#if defined(USE_CRL)
  rid_t    new;
#endif

#if defined(USE_CRL)
  ProcessId = crl_self_addr;
#else
  ProcessId = MY_PID;
#endif

  if (ProcessId == 0)
  {
#if defined(USE_CRL)
    new = rgn_create(sizeof(struct GlobalMemory));
    Global = (struct GlobalMemory *) rgn_map(new);
    rgn_start_write(Global);
#else
    Global = (struct GlobalMemory *) malloc(sizeof(struct GlobalMemory));
    /* fail here rather than zeroing and broadcasting a null pointer */
    assert(Global != NULL);
#endif

    /* initialize stuff in Global here
     */
    bzero(Global, sizeof(struct GlobalMemory));
#if !defined(USE_CRL)
    init_lock(&(Global->CountLock));
#endif

#if defined(USE_CRL)
    rgn_end_write(Global);
    rgn_bcast_send(sizeof(rid_t), &new);
#else
    shm_bcast_send_p((void *) Global);
#endif
  }
  else
  {
#if defined(USE_CRL)
    rgn_bcast_recv(sizeof(rid_t), &new);
    Global = (struct GlobalMemory *) rgn_map(new);
#else
    Global = (struct GlobalMemory *) shm_bcast_recv_p();
#endif
  }
}


/* reset this processor's private bookkeeping: simulation time and step
 * counter start at zero, and the first output is due dtout after the
 * starting time
 */

void local_init(void)
{
  Local.tnow  = 0.0;
  Local.nstep = 0;

  /* next output time is measured from the current (starting) time */
  Local.tout  = Local.tnow + dtout;
}


/* allocate bodytab (interleave elements across all processors)
 *
 * Every processor allocates a private table of nbody handles.  Bodies are
 * then created in rounds of nprocs: in round i this processor creates the
 * body for slot i+ProcessId (if that slot exists), and for each slot of
 * the round the creating processor broadcasts the handle while all others
 * receive it.  Must be called by all processors; the broadcasts are
 * collective and have to happen in the same order everywhere.
 */

void alloc_bodytab(void)
{
  int      i, j;
  unsigned ProcessId;
  unsigned nprocs;
#if defined(USE_CRL)
  rid_t    new;
#else
  bodyptr  new;
#endif

#if defined(USE_CRL)
  bodytab = (rid_t *) safe_malloc(sizeof(rid_t) * nbody);
#else
  bodytab = (bodyptr *) malloc(sizeof(bodyptr) * nbody);
#endif
  assert(bodytab != NULL);

#if defined(USE_CRL)
  ProcessId = crl_self_addr;
  nprocs    = crl_num_nodes;
#else
  ProcessId = MY_PID;
  nprocs    = NPROCS;
#endif

  for (i=0; i<nbody; i+=nprocs)
  {
    /* create this processor's body for the current round, if any */
    if ((i+ProcessId) < nbody)
    {
#if defined(USE_CRL)
      new = rgn_create(sizeof(body));
#else
      new = (bodyptr) malloc(sizeof(body));
      /* the pointer is about to be handed to every processor, so an
       * allocation failure must be caught before the broadcast
       */
      assert(new != NULL);
#endif
    }

    /* exchange the handles of this round, one slot at a time */
    for (j=0; j<nprocs; j++)
      if ((i+j) < nbody)
      {
        if (j == ProcessId)
        {
          bodytab[i+j] = new;

#if defined(USE_CRL)
          rgn_bcast_send(sizeof(rid_t), &new);
#else
          shm_bcast_send_p((void *) new);
#endif
        }
        else
        {
#if defined(USE_CRL)
          rgn_bcast_recv(sizeof(rid_t), &(bodytab[i+j]));
#else
          bodytab[i+j] = (bodyptr) shm_bcast_recv_p();
#endif
        }
      }
  }
}


/* allocate body and cell data space
 *
 * Derives the global limits maxleaf and maxcell from nbody, fleaves and
 * fcells, splits them evenly across processors (maxmycell, maxmyleaf,
 * maxmybody), and allocates this processor's cell and leaf storage plus
 * its private lists of body, cell and leaf handles.
 */

void tab_init(void)
{
  int i;
  int nprocs;

#if defined(USE_CRL)
  nprocs  = crl_num_nodes;
#else
  nprocs  = NPROCS;
#endif
  maxleaf = (int) ((double) fleaves * nbody);
  maxcell = fcells * maxleaf;

  maxmycell = maxcell / nprocs;
  maxmyleaf = maxleaf / nprocs;
  maxmybody = (nbody + (maxleaf * MAX_BODIES_PER_LEAF)) / nprocs;

  /* allocate cell space
   */
#if defined(USE_CRL)
  Local.ctab = (rid_t *) safe_malloc(sizeof(rid_t) * maxmycell);
  for (i=0; i<maxmycell; i++)
    Local.ctab[i] = rgn_create(sizeof(cell));
#else
  Local.ctab = (cellptr) malloc(sizeof(cell) * maxmycell);
  /* the table is dereferenced right below, so check it first */
  assert(Local.ctab != NULL);
  for (i=0; i<maxmycell; i++)
    init_lock(&(Local.ctab[i].cell_lock));
#endif

  /* allocate leaf space
   */
#if defined(USE_CRL)
  Local.ltab = (rid_t *) safe_malloc(sizeof(rid_t) * maxmyleaf);
  for (i=0; i<maxmyleaf; i++)
    Local.ltab[i] = rgn_create(sizeof(leaf));
#else
  Local.ltab = (leafptr) malloc(sizeof(leaf) * maxmyleaf);
  assert(Local.ltab != NULL);
#endif

  /* allocate space for personal lists of body, cell, and leaf
   * pointers
   */
#if defined(USE_CRL)
  Local.mybodytab = (rid_t *) safe_malloc(sizeof(rid_t) * maxmybody);
  Local.mycelltab = (rid_t *) safe_malloc(sizeof(rid_t) * maxmycell);
  Local.myleaftab = (rid_t *) safe_malloc(sizeof(rid_t) * maxmyleaf);
#else
  Local.mybodytab = (bodyptr *) malloc(sizeof(bodyptr) * maxmybody);
  Local.mycelltab = (cellptr *) malloc(sizeof(cellptr) * maxmycell);
  Local.myleaftab = (leafptr *) malloc(sizeof(leafptr) * maxmyleaf);
#endif

  assert(Local.mybodytab != NULL);
  assert(Local.mycelltab != NULL);
  assert(Local.myleaftab != NULL);
}


/* per-processor work loop: claim the initial share of bodies, size the
 * root box, then advance the system a fixed four time steps
 */

void SlaveStart(void)
{
  int self;
  int step;

#if defined(USE_CRL)
  self = crl_self_addr;
#else
  self = MY_PID;
#endif

  find_my_initial_bodies(bodytab, nbody, self);
  setbound();

  /* time-stepping loop
   */
  step = 0;
  while (step < 4)
  {
    stepsystem(self);

#if defined(USE_CRL) && defined(STATISTICS)
    /* once two steps have completed, restart the CRL counters */
    if (Local.nstep == 2)
    {
      if (crl_self_addr == 0)
      {
        printf("\n[reseting statistics]\n\n");
      }
      crl_stats_reset();
    }
#endif

    step++;
  }
}


/* advance N-body system one time-step
 *
 * ProcessId - id of the calling processor
 *
 * Called by every processor.  In order: resets the tree root, rebuilds
 * the tree, picks this processor's bodies by cost, computes forces on
 * them, advances their positions and velocities, and recomputes the
 * global bounding box (rmin, rsize) for the next step.  The barrier and
 * the min/max reductions are collective, so all processors must reach
 * them in the same order.
 */

void stepsystem(unsigned ProcessId)
{
  int      i;
  int      pidx;
  real     Cavg;
  bodyptr  p;
  vector   dvel, vel1, dpos;
  unsigned nprocs;
  real     side;

  timer_clear_and_start();

#if defined(USE_CRL)
  nprocs = crl_num_nodes;
#else
  nprocs = NPROCS;
#endif

  /* NOTE: set trackstart here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   *   CLOCK(trackstart);
   */

  init_root(ProcessId);

  /* start at same time
   */
#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif

  /* NOTE: set treebuildstart here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   *   CLOCK(treebuildstart);
   */

  /* load bodies into tree
   */
  maketree(ProcessId);

  /* NOTE: set treebuildend here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   * {
   *   CLOCK(treebuildend);
   *   Global->treebuildtime += treebuildend - treebuildstart;
   * }
   */

  Housekeep(ProcessId);

  /* under CRL the root stays open for reading until find_my_bodies()
   * has finished (that routine expects a read in progress on its cell)
   */
#if defined(USE_CRL)
  rgn_start_read(g_root);
#endif
  /* average cost per processor, taken from the cost stored at the root */
  Cavg = (real) Cost(g_root) / (real) nprocs;

  /* this processor's slice of the cumulative cost range; the last
   * processor's upper bound is raised by one (presumably so truncation
   * cannot leave the final body unassigned - TODO confirm)
   */
  Local.workMin = (int) (Cavg * ProcessId);
  Local.workMax = (int) (Cavg * (ProcessId + 1) + (ProcessId == (nprocs - 1)));

  /* NOTE: set partitionstart here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   *   CLOCK(partitionstart);
   */

  Local.mynbody = 0;
  find_my_bodies((nodeptr) g_root, 0, BRC_FUC, ProcessId);

#if defined(USE_CRL)
  rgn_end_read(g_root);
#endif

  /* NOTE: set partitionend here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   * {
   *   CLOCK(partitionend);
   *   Global->partitiontime += partitionend - partitionstart;
   * }
   */

  /* NOTE: set forcecalcstart here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   *   CLOCK(forcecalcstart);
   */

  ComputeForces(ProcessId);

  /* NOTE: set forcecalcend here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   * {
   *   CLOCK(forcecalcend);
   *   Global->forcecalctime += forcecalcend - forcecalcstart;
   * }
   */

  /* advance my bodies
   */
  for (pidx=0; pidx<Local.mynbody; pidx++)
  {
#if defined(USE_CRL)
    p = (bodyptr) rgn_map(Local.mybodytab[pidx]);
    rgn_start_write(p);
#else
    p = Local.mybodytab[pidx];
#endif

    /* half-step velocity, full-step position, then the second half-step
     * of velocity using the same acceleration
     */
    MULVS(dvel, Acc(p), dthf);              
    ADDV(vel1, Vel(p), dvel);               
    MULVS(dpos, vel1, dtime);               
    ADDV(Pos(p), Pos(p), dpos);             
    ADDV(Vel(p), vel1, dvel);               

    /* track the bounding box of the new positions */
    for (i=0; i<NDIM; i++)
    {
      if (Pos(p)[i] < Local.min[i])
	Local.min[i] = Pos(p)[i];

      if (Pos(p)[i] > Local.max[i])
	Local.max[i] = Pos(p)[i];
    }

#if defined(USE_CRL)
    rgn_end_write(p);
    rgn_unmap(p);
#endif
  }

  /* compute global min and max
   */
  for (i=0; i<NDIM; i++)
  {
#if defined(USE_CRL)
    Local.min[i] = rgn_reduce_dmin(Local.min[i]);
    Local.max[i] = rgn_reduce_dmax(Local.max[i]);
#else
    Local.min[i] = shm_reduce_dmin(Local.min[i]);
    Local.max[i] = shm_reduce_dmax(Local.max[i]);
#endif    
  }

  /* NOTE: set trackend here [only processor 0?]
   *
   * if ((ProcessId == 0) && (Local.nstep >= 2))
   * {
   *   CLOCK(trackend);
   *   Global->tracktime += trackend - trackstart;
   * }
   */

  /* Local.max now holds the box extent per dimension */
  SUBV(Local.max, Local.max, Local.min);

  /* side = largest extent over all dimensions */
  side = 0;
  for (i=0; i<NDIM; i++)
    if (side < Local.max[i])
      side = Local.max[i];

  /* new root box: corner moved out by side/100000 and edge lengthened to
   * 1.00002 * side, i.e. slightly larger than the extent just measured
   */
  ADDVS(rmin, Local.min, -side/100000.0);
  rsize = 1.00002 * side;
  /* reset the accumulators for the next step */
  SETVS(Local.min,  1E99);
  SETVS(Local.max, -1E99);

  Local.nstep += 1;
  Local.tnow  += dtime;

  timer_stop_and_print();
}


/* prepare the shared tree root for a new time step
 *
 * On the first call (g_root still NULL) processor 0 publishes its first
 * cell as the root and everyone else receives it.  On every call
 * processor 0 clears the root's fields, and each processor resets its
 * private cell and leaf counters.
 */

void init_root(unsigned ProcessId)
{
  int   sub;
  int   is_master;
#if defined(USE_CRL)
  rid_t root_rid;
#endif

  is_master = (ProcessId == 0);

  /* first call only: agree on which cell is the root */
  if (g_root == NULL)
  {
#if defined(USE_CRL)
    if (is_master)
    {
      root_rid = Local.ctab[0];
      rgn_bcast_send(sizeof(rid_t), &root_rid);
    }
    else
    {
      rgn_bcast_recv(sizeof(rid_t), &root_rid);
    }
    g_root = (cellptr) rgn_map(root_rid);
#else
    if (is_master)
    {
      g_root = &(Local.ctab[0]);
      shm_bcast_send_p((void *) g_root);
    }
    else
    {
      g_root = (cellptr) shm_bcast_recv_p();
    }
#endif
  }

  /* the SPLASH-2 version of barnes only resets mynumleaf on
   * processors other than 0; presumably this is a bug?
   */
  Local.mynumleaf = 0;

  /* processor 0 already owns one cell: the root itself */
  Local.mynumcell = is_master ? 1 : 0;

  if (!is_master)
    return;

#if defined(USE_CRL)
  rgn_start_write(g_root);
#endif

  Type(g_root)  = CELL;
  Done(g_root)  = FALSE;
  Level(g_root) = IMAX >> 1;

  sub = 0;
  while (sub < NSUB)
  {
    Subp(g_root)[sub] = NULL;
    sub++;
  }

#if defined(USE_CRL)
  rgn_end_write(g_root);
#endif
}


/* compute the force on each of this processor's bodies
 *
 * For every body in Local.mybodytab: remember the previous acceleration,
 * clear the body's cost, run hackgrav() on it, fold the per-body
 * interaction counters into the per-step totals, and (from the second
 * step on) correct the velocity using the change in acceleration.
 */
void ComputeForces(unsigned ProcessId)
{
  int     n;
  bodyptr bp;
  vector  old_acc, acc_delta, vel_delta;

  n = 0;
  while (n < Local.mynbody)
  {
#if defined(USE_CRL)
    bp = (bodyptr) rgn_map(Local.mybodytab[n]);
    rgn_start_write(bp);
#else
    bp = Local.mybodytab[n];
#endif

    /* keep the old acceleration; hackgrav() overwrites it */
    SETV(old_acc, Acc(bp));
    Cost(bp) = 0;

    hackgrav(bp, ProcessId);

    Local.myn2bcalc += Local.myn2bterm;
    Local.mynbccalc += Local.mynbcterm;

    /* a missed self-interaction counts as another goofup
     */
    if (!Local.skipself)
    {
      Local.myselfint += 1;
    }

    if (Local.nstep > 0)
    {
      /* second-order velocity correction from the change in accel
       */
      SUBV(acc_delta, Acc(bp), old_acc);
      MULVS(vel_delta, acc_delta, dthf);
      ADDV(Vel(bp), Vel(bp), vel_delta);
    }

#if defined(USE_CRL)
    rgn_end_write(bp);
    rgn_unmap(bp);
#endif

    n++;
  }
}


/* put initial list of bodies assigned to the processor into mybodytab
 *
 * btab      - table of all body handles
 * nbody     - number of entries in btab (shadows the global of that name)
 * ProcessId - id of the calling processor
 *
 * The table is split into contiguous chunks: each processor gets
 * nbody / nprocs bodies, and the first (nbody % nprocs) processors get
 * one extra.  Ends with a barrier, so every processor must call it.
 */

#if defined(USE_CRL)
void find_my_initial_bodies(rid_t *btab, int nbody, unsigned ProcessId)
#else
void find_my_initial_bodies(bodyptr *btab, int nbody, unsigned ProcessId)
#endif
{
  int nprocs;
  int extra,offset,i;

#if defined(USE_CRL)
  nprocs = crl_num_nodes;
#else
  nprocs = NPROCS;
#endif

  Local.mynbody = nbody / nprocs;
  extra         = nbody % nprocs;

  if (ProcessId < (unsigned) extra)
  {
    /* one of the first `extra' processors: chunk is one body longer and
     * all preceding chunks have that same longer length
     */
    Local.mynbody += 1;
    offset = Local.mynbody * ProcessId;
  }
  else
  {
    /* skip the `extra' additional bodies held by the earlier processors */
    offset = Local.mynbody * ProcessId + extra;
  }

  for (i=0; i<Local.mynbody; i++)
    Local.mybodytab[i] = btab[offset+i];

#if defined(USE_CRL)
  rgn_barrier();
#else
  shm_barrier();
#endif
}


/* Collect into Local.mybodytab the bodies that fall into this
 * processor's cost range, by walking the subtree rooted at mycell.
 *
 * mycell    - node to examine (mycell is assumed to be the address of a
 *             mapped region with a read operation in progress)
 * work      - accumulated cost of everything visited before mycell
 * direction - row index into Child_Sequence / Direction_Sequence, which
 *             gives the order in which the children are visited
 * ProcessId - id of the calling processor (used in the error message)
 *
 * A body is taken once the running cost has reached Local.workMin, and
 * the walk stops once it has reached Local.workMax.  `work' is passed by
 * value, so each level keeps its own running total.
 */

void find_my_bodies(nodeptr mycell, int work, int direction, unsigned ProcessId)
{
  int     i;
  leafptr l;
#if defined(USE_CRL)
  rid_t   q_tmp;
#else
  nodeptr q_tmp;
#endif
  nodeptr qptr;
  bodyptr p;

  if (Type(mycell) == LEAF)
  {
    l = (leafptr) mycell;

    for (i=0; i<l->num_bodies; i++)
    {
      /* the 0.1 slack makes the integer/real comparison tolerant of
       * rounding (presumably - TODO confirm)
       */
      if (work >= (Local.workMin - 0.1))
      {
	if ((Local.mynbody+2) > maxmybody)
	  error("find_my_bodies: Proc %d needs more than %d bodies; increase fleaves\n", ProcessId, maxmybody);

	Local.mybodytab[Local.mynbody++] = Bodyp(l)[i];
      }

      /* under CRL the body has to be mapped and opened just to read
       * its cost
       */
#if defined(USE_CRL)
      p = (bodyptr) rgn_map(Bodyp(l)[i]);
      rgn_start_read(p);
#else
      p = Bodyp(l)[i];
#endif

      work += Cost(p);

#if defined(USE_CRL)
      rgn_end_read(p);
      rgn_unmap(p);
#endif

      if (work >= (Local.workMax - 0.1)) break;
    }
  }
  else
  {
    for (i=0; i<NSUB; i++)
    {
      if (work >= (Local.workMax - 0.1)) break;

      q_tmp = Subp(mycell)[Child_Sequence[direction][i]];
      if (q_tmp != 0)
      {
#if defined(USE_CRL)
	qptr = (nodeptr) rgn_map(q_tmp);
	rgn_start_read(qptr);
#else
	qptr = q_tmp;
#endif

	/* descend only if this child reaches into our cost range;
	 * otherwise just account for its cost and move on
	 */
	if ((work + Cost(qptr)) >= (Local.workMin - 0.1))
	{
	  find_my_bodies(qptr, work, Direction_Sequence[direction][i],
			 ProcessId);
	}

	work += Cost(qptr);

#if defined(USE_CRL)
	rgn_end_read(qptr);
	rgn_unmap(qptr);
#endif
      }
    }
  }
}


/* per-step reset of this processor's accumulators: the bounding-box
 * extremes and the interaction counters (ProcessId is not used)
 */

void Housekeep(unsigned ProcessId)
{
  /* start min/max at opposite extremes so any position replaces them */
  SETVS(Local.min,  1E99);
  SETVS(Local.max, -1E99);

  Local.myselfint = 0;
  Local.mynbccalc = 0;
  Local.myn2bcalc = 0;
}


/* generate Plummer model initial conditions for test runs, scaled to
 * units such that M = -4E = G = 1 (Henon, Heggie, etc). See Aarseth,
 * SJ, Henon, M, & Wielen, R (1974) Astr & Ap, 37, 183.
 * [only run on processor 0]
 *
 * The first half of bodytab (rounded up) is sampled from the model; the
 * second half is a copy of the first with every position component
 * shifted by a fixed offset.  Finally all positions and velocities are
 * shifted so that their averages over the nbody bodies are zero.
 */

#define MFRAC (0.999)		/* mass cut off at MFRAC of total */

void testdata(void)
{
  int     i;
  int     pidx;
  int     halfnbody;
  real    rsc, vsc;
  real    r, v, x, y;
  vector  cmr, cmv;
  bodyptr p;
  float   offset;
  bodyptr cp;

#if defined(USE_CRL)
  assert(crl_self_addr == 0);
#else
  assert(MY_PID == 0);
#endif

  rsc     = 9 * PI / 16;
  vsc     = sqrt(1.0 / rsc);

  /* running sums of position and velocity over all bodies */
  CLRV(cmr);
  CLRV(cmv);

  halfnbody = nbody / 2;
  if (nbody % 2 != 0) halfnbody++;

  /* first half: sample from the model */
  for (pidx=0; pidx<halfnbody; pidx++)
  {
#if defined(USE_CRL)
    p = (bodyptr) rgn_map(bodytab[pidx]);
    rgn_start_write(p);
#else
    p = bodytab[pidx];
#endif

    Type(p) = BODY;
    Mass(p) = 1.0 / nbody;
    Cost(p) = 1;

    r = 1 / sqrt(pow(xrand(0.0, MFRAC), -2.0/3.0) - 1);

    /* reject radii greater than 9
     */
    while (r > 9.0)
    {
      r = 1 / sqrt(pow(xrand(0.0, MFRAC), -2.0/3.0) - 1);
    }

    pickshell(Pos(p), rsc * r);
    ADDV(cmr, cmr, Pos(p));

    /* rejection sampling for the speed fraction x */
    do
    {
      x = xrand(0.0, 1.0);
      y = xrand(0.0, 0.1);
    } while (y > x*x * pow(1 - x*x, 3.5));

    v = sqrt(2.0) * x / pow(1 + r*r, 0.25);
    pickshell(Vel(p), vsc * v);
    ADDV(cmv, cmv, Vel(p));

#if defined(USE_CRL)
    rgn_end_write(p);
    rgn_unmap(p);
#endif
  }

  offset = 4.0;

  /* second half: copy of the first half, displaced by `offset' */
  for (pidx=halfnbody; pidx<nbody; pidx++)
  {
#if defined(USE_CRL)
    p = (bodyptr) rgn_map(bodytab[pidx]);
    rgn_start_write(p);
#else
    p = bodytab[pidx];
#endif

    Type(p) = BODY;
    Mass(p) = 1.0 / nbody;
    Cost(p) = 1;

#if defined(USE_CRL)
    cp = (bodyptr) rgn_map(bodytab[pidx-halfnbody]);
    rgn_start_read(cp);
#else
    cp = bodytab[pidx-halfnbody];
#endif

    for (i=0; i<NDIM; i++)
    {
      Pos(p)[i] = Pos(cp)[i] + offset;
      Vel(p)[i] = Vel(cp)[i];
    }

    /* accumulate once per body, and only after every component has been
     * written; doing this inside the component loop would add the body
     * NDIM times and read components that are not yet initialized
     */
    ADDV(cmr, cmr, Pos(p));
    ADDV(cmv, cmv, Vel(p));

#if defined(USE_CRL)
    rgn_end_read(cp);
    rgn_unmap(cp);
    rgn_end_write(p);
    rgn_unmap(p);
#endif
  }

  /* turn the sums into averages */
  DIVVS(cmr, cmr, (real) nbody);
  DIVVS(cmv, cmv, (real) nbody);

  /* shift every body so the averages become zero */
  for (pidx=0; pidx<nbody; pidx++)
  {
#if defined(USE_CRL)
    p = (bodyptr) rgn_map(bodytab[pidx]);
    rgn_start_write(p);
#else
    p = bodytab[pidx];
#endif

    SUBV(Pos(p), Pos(p), cmr);
    SUBV(Vel(p), Vel(p), cmv);

#if defined(USE_CRL)
    rgn_end_write(p);
    rgn_unmap(p);
#endif
  }
}


/* fill vec with a random point at distance rad from the origin
 *
 * Candidates are drawn component-wise from [-1, 1] and redrawn until the
 * squared length is no larger than 1; the accepted candidate is then
 * rescaled to length rad.
 */

void pickshell(real vec[], real rad)
{
  int    axis;
  double norm_sq, factor;

  for (;;)
  {
    for (axis=0; axis<NDIM; axis++)
      vec[axis] = xrand(-1.0, 1.0);
    DOTVP(norm_sq, vec, vec);

    /* accept as soon as the candidate is not outside the unit ball */
    if (!(norm_sq > 1.0))
      break;
  }

  factor = rad / sqrt(norm_sq);
  MULVS(vec, vec, factor);
}


/* Establish the root box (rmin, rsize) from the initial body positions.
 * Called once, before the first time step, by every processor: each one
 * scans its own bodies and the per-dimension extremes are then combined
 * with collective min/max reductions.
 */
void setbound(void)
{
  int     i;
  int     b;
  real    extent;
  bodyptr bp;

  SETVS(Local.min,  1E99);
  SETVS(Local.max, -1E99);

  /* local extremes over my bodies
   */
  b = 0;
  while (b < Local.mynbody)
  {
#if defined(USE_CRL)
    bp = (bodyptr) rgn_map(Local.mybodytab[b]);
    rgn_start_read(bp);
#else
    bp = Local.mybodytab[b];
#endif

    for (i=0; i<NDIM; i++)
    {
      if (Local.min[i] > Pos(bp)[i])
      {
        Local.min[i] = Pos(bp)[i];
      }

      if (Local.max[i] < Pos(bp)[i])
      {
        Local.max[i] = Pos(bp)[i];
      }
    }

#if defined(USE_CRL)
    rgn_end_read(bp);
    rgn_unmap(bp);
#endif

    b++;
  }

  /* global extremes across all processors
   */
  for (i=0; i<NDIM; i++)
  {
#if defined(USE_CRL)
    Local.min[i] = rgn_reduce_dmin(Local.min[i]);
    Local.max[i] = rgn_reduce_dmax(Local.max[i]);
#else
    Local.min[i] = shm_reduce_dmin(Local.min[i]);
    Local.max[i] = shm_reduce_dmax(Local.max[i]);
#endif
  }

  /* Local.max becomes the per-dimension extent of the box */
  SUBV(Local.max, Local.max, Local.min);

  /* the box edge is the largest of those extents */
  extent = 0;
  for (i=0; i<NDIM; i++)
  {
    if (extent < Local.max[i])
      extent = Local.max[i];
  }

  ADDVS(rmin, Local.min, -extent/100000.0);
  rsize = 1.00002 * extent;

  /* leave the accumulators reset for the first time step */
  SETVS(Local.min,  1E99);
  SETVS(Local.max, -1E99);
}


#if !defined(USE_CRL)

/* support for global synchronization under shared memory
 */

/* number of slots in shm_global_tmp; shm_sync_seq_init() asserts that the
 * processor count is below this value
 */
#define HackMaxProcs (128)

/* pairwise combiners handed to REDUCE by shm_reduce_dmin/dmax */
static double _shm_dmin(double, double);
static double _shm_dmax(double, double);

/* one slot per processor: written by shm_sync_seq_init() and by the
 * sender in shm_bcast_send_p(), read by each processor at its own index
 */
shared void      *shm_global_tmp[HackMaxProcs];
/* reduction tree used for barriers and reductions; set per processor in
 * shm_sync_par_init()
 */
static red_tree_p shm_global_sync;

/* first (sequential) half of the synchronization setup
 * [only run on processor 0]
 *
 * Builds the global reduction tree and parks a pointer to it in every
 * processor's slot of shm_global_tmp, where shm_sync_par_init() picks
 * it up.
 */
void shm_sync_seq_init(void)
{
  int        slot;
  int        self;
  int        nprocs;
  red_tree_p tree;

  self   = MY_PID;
  nprocs = NPROCS;

  assert(self == 0);
  assert(nprocs < HackMaxProcs);
  tree = make_global_reduction_tree();

  slot = 0;
  while (slot < nprocs)
  {
    shm_global_tmp[slot] = (void *) tree;
    slot++;
  }
}


/* second (parallel) half of the synchronization setup
 * [run on all processors]
 *
 * Each processor fetches the reduction tree pointer from its own slot of
 * shm_global_tmp and keeps it in shm_global_sync.
 */
void shm_sync_par_init(void)
{
  int me = MY_PID;

  shm_global_sync = (red_tree_p) shm_global_tmp[me];
  assert(shm_global_sync != NULL);
}


/* barrier across all processors, using the global reduction tree */
void shm_barrier(void)
{
  int me;

  me = MY_PID;
  SM_TREE_BARRIER(me, shm_global_sync);
}


/* sending side of a pointer broadcast
 *
 * Writes val into every processor's slot of shm_global_tmp, then passes
 * two barriers.  shm_bcast_recv_p() reads its slot between the same two
 * barriers, so the second one keeps the slots from being reused before
 * everyone has read them.
 */
void shm_bcast_send_p(void *val)
{
  int slot;
  int nprocs;

  nprocs = NPROCS;

  slot = 0;
  while (slot < nprocs)
  {
    shm_global_tmp[slot] = val;
    slot++;
  }

  SM_TREE_BARRIER(MY_PID, shm_global_sync);
  SM_TREE_BARRIER(MY_PID, shm_global_sync);
}


/* receiving side of a pointer broadcast
 *
 * Waits at the first barrier for the sender to fill the slots, reads this
 * processor's slot, and then joins the second barrier before returning
 * the value read.
 */
void *shm_bcast_recv_p(void)
{
  int   me;
  void *received;

  me = MY_PID;

  SM_TREE_BARRIER(me, shm_global_sync);
  received = shm_global_tmp[me];
  SM_TREE_BARRIER(me, shm_global_sync);

  return received;
}


/* collective reduction of val over the global tree, combining
 * contributions with _shm_dmin; returns the reduced value
 */
double shm_reduce_dmin(double val)
{
  return REDUCE(MY_PID, shm_global_sync, _shm_dmin, val);
}


/* collective reduction of val over the global tree, combining
 * contributions with _shm_dmax; returns the reduced value
 */
double shm_reduce_dmax(double val)
{
  return REDUCE(MY_PID, shm_global_sync, _shm_dmax, val);
}


/* collective reduction of val over the global tree using REDUCE_ADD;
 * returns the reduced value
 */
double shm_reduce_dadd(double val)
{
  return REDUCE_ADD(MY_PID, shm_global_sync, val);
}


/* smaller of a and b; b is returned whenever a < b does not hold */
static double _shm_dmin(double a, double b)
{
  if (a < b)
    return a;

  return b;
}


/* larger of a and b; b is returned whenever a > b does not hold */
static double _shm_dmax(double a, double b)
{
  if (a > b)
    return a;

  return b;
}

#endif
