/*************************************************************************/
/*                                                                       */
/*  Copyright (c) 1994 Stanford University                               */
/*                                                                       */
/*  All rights reserved.                                                 */
/*                                                                       */
/*  Permission is given to use, copy, and modify this software for any   */
/*  non-commercial purpose as long as this copyright notice is not       */
/*  removed.  All other uses, including redistribution in whole or in    */
/*  part, are forbidden without prior written permission.                */
/*                                                                       */
/*  This software is provided with absolutely no warranty and no         */
/*  support.                                                             */
/*                                                                       */
/* --------------------------------------------------------------------- */
/*                                                                       */
/*  Modifications of the original Barnes-Hut code (as taken from         */
/*  Stanford's SPLASH-2 distribution) to allow use on Alewife and        */
/*  with CRL are copyright:                                              */
/*                                                                       */
/*  Copyright (C) 1995 Massachusetts Institute of Technology             */
/*                                                                       */
/*************************************************************************/

/*  Usage:   water <nprocs>
    The control parameters are read from the file named by INPUT_FILE,
    which has 10 fields that can be described in order as follows:
    
    TSTEP:   the physical time interval (in sec) between timesteps.  
    Good default is 1e-15.
    NMOL:    the number of molecules to be simulated.
    NSTEP:   the number of timesteps to be simulated.
    NORDER:  the order of the predictor-corrector method to be used.
    set this to 6.
    NSAVE:   the frequency with which to save data in data collection.
    Set to 0 always.
    NRST:    the frequency with which to write RST file: set to 0 always (not used). 
    NPRINT:  the frequency with which to compute potential energy. 
    i.e. the routine POTENG is called every NPRINT timesteps.
    It also computes intermolecular as well as intramolecular
    interactions, and hence is very expensive.
    NFMC:    Not used (historical artifact).  Set to anything, say 0.
    NumProcs: the number of processors to be used.
       **** read but always ignored: the processor count is taken
            from the command-line argument (argv[1]) ****
    CUTOFF:  the cutoff radius to be used (in Angstrom,
    floating-point).  In a real simulation, this
    will be set to 0 here, in which case the program will
    compute it itself (and set it to about 11 Angstrom).
    It can be set by the user if they want
    to use an artificially small cutoff radius, for example
    to control the number of boxes created for small problems
    (and not have fewer boxes than processors).
    */

#include "top.h"
#include "stdio.h"
#include "split.h"
#include <math.h>

/* alewife cpp seems to be leaving "unix" #define-d, so #undef it
 */
#if defined(ALEWIFE) && defined(unix)
#undef unix
#endif

#if defined(TCPUNIX) || defined(NULLCRL)
#include <sys/time.h>
#endif

/*  include files for declarations  */
#define extern
#include "parameters.h"
#include "mdvar.h"
#include "water.h"
#include "wwpot.h"
#include "cnst.h"
#include "mddata.h"
#include "fileio.h"
#include "frcnst.h"
#include "global.h"
#undef extern

/* Control parameters; each is read from INPUT_FILE by main_continuation(). */
int NSTEP, NSAVE, NRST, NPRINT,NFMC;
int NORD1;                      /* set to NORDER+1 in main_continuation() */
int II;                         /*  variables explained in common.h */
int i;
int NDATA;
/* NFSV, NFRST and LKT are passed to MDMAIN() as its first, second and
   seventh arguments; they are not modified in this file. */
int   NFRST=11;
int  NFSV=10;
int  LKT=0;

int ProcID;                     /* process id; one per process.  Set in
                                   main_continuation() from crl_self_addr,
                                   MY_PID or 0 depending on the build */
int StartMol[MAXPROCS+1];       /* StartMol[p] is the number of the first
                                   molecule handled by process p, and
                                   StartMol[NumProcs] is NMOL; used
                                   for static scheduling     */ 
int MolsPerProc;                /* number of mols per processor
                                   (NMOL / NumProcs, rounded down) */ 
int NumProcs;                   /* number of processors being used; 
                                   run-time input (argv[1]); the value in
                                   INPUT_FILE is read and then overridden */

void main_continuation(int);

/*
 * Program entry point (compiled as main2 when TCPUNIX is defined).
 *
 * Expects exactly one command-line argument, the number of processors,
 * which must parse as a positive integer; it is stored in the global
 * NumProcs.  Anything else prints a usage message and exits with
 * status 1.  On success, control passes to main_continuation(), either
 * directly or on every processor via do_in_parallel() under ALEWIFE.
 *
 * Returns 0.
 */
#if defined(TCPUNIX)
extern char *GROUP;
int main2(int argc, char **argv)
#else
int main(int argc, char **argv)
#endif
{
    int bad_usage;

#if defined(CM5)
    CMMD_fset_io_mode(stdout, CMMD_independent);
    CMMD_fset_io_mode(stderr, CMMD_independent);
#endif

#if !defined(USE_CRL)
    shm_sync_seq_init();
#endif

    /* Reject a missing, non-numeric or non-positive processor count up
     * front: a zero count would later be used as a divisor when the
     * molecules are partitioned.  The short-circuit keeps argv[1] from
     * being touched unless argc is 2.
     */
    bad_usage = (argc != 2) ||
                (sscanf(argv[1], "%d", &NumProcs) != 1) ||
                (NumProcs < 1);

    if (bad_usage)
    {
#if defined(CM5)
      /* only one node reports, to avoid a message per node */
      if (CMMD_self_address() == 0)
        fprintf(stderr, "usage: %s <nprocs>\n", argv[0]);
#else
      fprintf(stderr, "usage: %s <nprocs>\n", argv[0]);
#endif
      exit(1);
    }

#if defined(CM5)
    assert(NumProcs > 0);
    assert(NumProcs <= CMMD_partition_size());
    CMMD_reset_partition_size(NumProcs);
#endif

#if defined(ALEWIFE)
    do_in_parallel(main_continuation, NumProcs);
#else
    main_continuation(NumProcs);
#endif

    return 0;
} /* main.c */



/*
 * main_continuation -- per-processor body of the program.
 *
 * Called from main(), directly or through do_in_parallel() when
 * ALEWIFE is defined.  In order, it:
 *   1. initializes the synchronization layer (shm_* or CRL) and sets
 *      the global ProcID;
 *   2. reads the ten control parameters from INPUT_FILE;
 *   3. fills in the static molecule partition StartMol[];
 *   4. allocates the per-molecule data (VAR), the global accumulators
 *      (SUM_g, POT_g, VIR_g) and PFORCES, broadcasting the shared
 *      pieces from their owner to the other processors;
 *   5. calls SYSCNS(), has processor 0 call INITIA(), waits on a
 *      barrier and calls MDMAIN().
 *
 * NumProcsRequested: processor count from the command line; it
 *                    replaces the NumProcs value read from INPUT_FILE.
 *
 * The process exits if INPUT_FILE cannot be opened or parsed, or if
 * the processor count does not fit in StartMol[].
 */
void main_continuation(int NumProcsRequested)
{
    double XTT, MDMAIN();
    FILE *ins;

#if !defined(USE_CRL)
    shm_sync_par_init();
#endif

#if defined(CM5)
    CMMD_sync_with_nodes();
    CMMD_reset_partition_size(NumProcsRequested);
    /* NOTE(review): ProcID is only assigned further down, so this test
     * sees whatever value the global held on entry -- confirm intent.
     */
    if(ProcID>=CMMD_partition_size()) {
        exit(0);
    }
#endif

#if defined(USE_CRL)
#if defined(TCPUNIX)
    crl_init(GROUP);
#else
    crl_init();
#endif
    assert(crl_num_nodes == NumProcsRequested);
    ProcID = crl_self_addr;
#elif defined(ALEWIFE)
    assert(NPROCS == NumProcsRequested);
    ProcID = MY_PID;
#else
    ProcID = 0;
#endif

    /* default values for the control parameters of the driver */
    /* are in parameters.h */

    /*  POSSIBLE ENHANCEMENT:  Here's where one might bind the main process 
        (process 0) to a processor if one wanted to. Others can be bound in 
        the WorkStart routine.
        */

    six = stdout;   /* output file */

    TEMP  =298.0;
    RHO   =0.9980;
    CUTOFF=0.0;

    /* read input */
    ins = fopen(INPUT_FILE, "r");
    if (ins == NULL)
    {
      fprintf(stderr, "%s:%d unable to open INPUT_FILE (%s)\n",
              __FILE__, __LINE__, INPUT_FILE);
      fflush(stderr);
      exit(-1);
    }

    /* as of 1 march 95, using a single big fscanf() to read all 10
     * values from ins in one fell swoop did not work correctly on
     * Alewife -- cutoff would get the value 6.000000 instead of the
     * desired 6.212752. resorting to individual fscanf()s for each
     * value seems to solve the problem until fscanf is fixed. 
     */
    if ((fscanf(ins, "%lf", &TSTEP)   != 1) ||
        (fscanf(ins, "%d", &NMOL)     != 1) ||
        (fscanf(ins, "%d", &NSTEP)    != 1) ||
        (fscanf(ins, "%d", &NORDER)   != 1) ||
        (fscanf(ins, "%d", &NSAVE)    != 1) ||
        (fscanf(ins, "%d", &NRST)     != 1) ||
        (fscanf(ins, "%d", &NPRINT)   != 1) ||
        (fscanf(ins, "%d", &NFMC)     != 1) ||
        (fscanf(ins, "%d", &NumProcs) != 1) || /* ignored */
        (fscanf(ins, "%lf", &CUTOFF)  != 1))
    {
        fprintf(stderr, "ERROR: Usage: water < infile, which must have 10 fields, see SPLASH documentation or comment at top of water.C\n");
        assert(0);
        /* assert() compiles away under NDEBUG; never continue with
         * partially-read parameters.
         */
        exit(-1);
    }

    /* everything needed has been read; release the stream */
    fclose(ins);

    /* ignore value of NumProcs provided in INPUT_FILE
     */
    NumProcs = NumProcsRequested;

    /* StartMol[] is indexed 0..NumProcs below and NumProcs is used as
     * a divisor, so it must be at least 1 and leave room for the
     * final sentinel entry.
     */
    {
        int max_procs = (int) (sizeof(StartMol) / sizeof(StartMol[0])) - 1;

        if ((NumProcs < 1) || (NumProcs > max_procs))
        {
            fprintf(stderr, "%s:%d invalid processor count %d (must be 1..%d)\n",
                    __FILE__, __LINE__, NumProcs, max_procs);
            fflush(stderr);
            exit(-1);
        }
    }

    if (ProcID == 0) {
        printf("Using %d procs on %d steps of %d mols\n", NumProcs, NSTEP, NMOL);
        printf("Other parameters:\n\tTSTEP = %8.2e\n\tNORDER = %d\n\tNSAVE = %d\n",TSTEP,NORDER,NSAVE);
        printf("\tNRST = %d\n\tNPRINT = %d\n\tNFMC = %d\n\tCUTOFF = %lf\n\n",NRST,NPRINT,NFMC,CUTOFF);
    }

    /* SET UP SCALING FACTORS AND CONSTANTS */

    NORD1=NORDER+1;

    CNSTNT(NORD1,TLC);  /* sub. call to set up constants */

    { /* Do memory initializations */
        int            pid, mol;
        molecule_type *new_dat;
#if defined(USE_CRL)
        rid_t          new_rid;
#endif

        /*  POSSIBLE ENHANCEMENT:  One might bind the first process to 
            a processor here, even before the other (child) processes are 
            bound later in mdmain(). 
            */

        /* set up control for static scheduling */

        /* THIS NEEDS FIXING: note that the following code can lead to
         * a pretty significant load imbalance if NumProcs doesn't
         * divide NMOL evenly (e.g. 343 mos / 32 procs = 10 mols per
         * proc on the first 31 processors and 33 mols on the last one!
         */
        MolsPerProc = NMOL / NumProcs;
        StartMol[0] = 0;
        for (pid = 1; pid < NumProcs; pid += 1) {
            StartMol[pid] = StartMol[pid-1] + MolsPerProc;
        }
        StartMol[NumProcs] = NMOL;

        /* allocate space for main (VAR) data structure as well as
           synchronization variables */

        /*  POSSIBLE ENHANCEMENT: One might want to allocate a process's
            portion of the VAR array and what it points to in its local 
            memory */

#if defined(USE_CRL)
        VAR_rids = (rid_t *) safe_malloc(sizeof(rid_t) * NMOL);
#endif
        VAR = (molecule_type **) safe_malloc(sizeof(molecule_type *) * NMOL);

        /* allocate regions: the owner of each molecule creates it and
         * broadcasts its handle; every other processor receives it, so
         * all processors walk the molecules in the same order.
         */
        for (mol=0; mol<NMOL; mol++)
        {
          if ((mol >= StartMol[ProcID]) && (mol < StartMol[ProcID+1]))
          {
#if defined(USE_CRL)
            new_rid = rgn_create(sizeof(molecule_type));
            VAR_rids[mol] = new_rid;

            new_dat = (molecule_type *) rgn_map(new_rid);
            rgn_start_write(new_dat);
            bzero((char *) new_dat, sizeof(molecule_type));
            rgn_end_write(new_dat);

            /* don't call rgn_unmap() here so molecule regions stay
             * mapped on their home node (mapping addresses get stored
             * in VAR[]; see below)
             */

            rgn_bcast_send(sizeof(rid_t), &new_rid);
#else
            new_dat  = (molecule_type *) safe_malloc(sizeof(molecule_type));
            init_lock(&(new_dat->lock));

            shm_bcast_send_p((void *) new_dat);
#endif

            VAR[mol] = new_dat;
          }
          else
          {
#if defined(USE_CRL)
            rgn_bcast_recv(sizeof(rid_t), &new_rid);
            VAR_rids[mol] = new_rid;

            /* keep all molecules mapped all the time
             */
            VAR[mol] = rgn_map(new_rid);
#else
            VAR[mol] = (molecule_type *) shm_bcast_recv_p();
#endif
          }
        }

        /* allocate the rest of global memory
         */
        if (ProcID == 0)
        {
#if defined(USE_REDUCTION)
          SUM_g = (SUM_struct *) safe_malloc(sizeof(SUM_struct));
          POT_g = (POT_struct *) safe_malloc(sizeof(POT_struct));
          VIR_g = (VIR_struct *) safe_malloc(sizeof(VIR_struct));
#elif defined(USE_CRL)
          SUM_rid = rgn_create(sizeof(SUM_struct));
          POT_rid = rgn_create(sizeof(POT_struct));
          VIR_rid = rgn_create(sizeof(VIR_struct));
          rgn_bcast_send(sizeof(rid_t), &SUM_rid);
          rgn_bcast_send(sizeof(rid_t), &POT_rid);
          rgn_bcast_send(sizeof(rid_t), &VIR_rid);
#else
          SUM_g = (SUM_struct *) safe_malloc(sizeof(SUM_struct));
          POT_g = (POT_struct *) safe_malloc(sizeof(POT_struct));
          VIR_g = (VIR_struct *) safe_malloc(sizeof(VIR_struct));
          init_lock(&(SUM_g->lock));
          init_lock(&(POT_g->lock));
          init_lock(&(VIR_g->lock));
          shm_bcast_send_p((void *) SUM_g);
          shm_bcast_send_p((void *) POT_g);
          shm_bcast_send_p((void *) VIR_g);
#endif
        }
        else
        {
#if defined(USE_REDUCTION)
          SUM_g = (SUM_struct *) safe_malloc(sizeof(SUM_struct));
          POT_g = (POT_struct *) safe_malloc(sizeof(POT_struct));
          VIR_g = (VIR_struct *) safe_malloc(sizeof(VIR_struct));
#elif defined(USE_CRL)
          rgn_bcast_recv(sizeof(rid_t), &SUM_rid);
          rgn_bcast_recv(sizeof(rid_t), &POT_rid);
          rgn_bcast_recv(sizeof(rid_t), &VIR_rid);
#else
          SUM_g = (SUM_struct *) shm_bcast_recv_p();
          POT_g = (POT_struct *) shm_bcast_recv_p();
          VIR_g = (VIR_struct *) shm_bcast_recv_p();
#endif
        }

#if defined(USE_CRL) && !defined(USE_REDUCTION)
        /* keep SUM, POT, and VIR structures permanently mapped
         */
        SUM_g = (SUM_struct *) rgn_map(SUM_rid);
        POT_g = (POT_struct *) rgn_map(POT_rid);
        VIR_g = (VIR_struct *) rgn_map(VIR_rid);
#endif

        /*  POSSIBLE ENHANCEMENT: One might want to allocate  process i's
            PFORCES[i] array in its local memory */

        { int j,k;

          /* PFORCES is an NMOL x NDIR x NATOM table of doubles */
          PFORCES = (double ***) safe_malloc(NMOL * sizeof (double **));
          for (j = 0; j < NMOL; j++) {
              PFORCES[j] = (double **) safe_malloc(NDIR * sizeof (double *));
              for (k = 0; k < NDIR; k++) {
                  PFORCES[j][k] = (double *) safe_malloc(NATOM * sizeof (double));
              }
          }
        }

    }

    SYSCNS();    /* sub. call to initialize system constants  */

    if(ProcID == 0) {
        fprintf(six,"\nTEMPERATURE                = %8.2f K\n",TEMP);
        fprintf(six,"DENSITY                    = %8.5f G/C.C.\n",RHO);
        fprintf(six,"NUMBER OF MOLECULES        = %8d\n",NMOL);
        fprintf(six,"NUMBER OF PROCESSORS       = %8d\n",NumProcs);
        fprintf(six,"TIME STEP                  = %8.2e SEC\n",TSTEP);
        fprintf(six,"ORDER USED TO SOLVE F=MA   = %8d \n",NORDER);
        fprintf(six,"NO. OF TIME STEPS          = %8d \n",NSTEP);
        fprintf(six,"FREQUENCY OF DATA SAVING   = %8d \n",NSAVE);
        fprintf(six,"FREQUENCY TO WRITE RST FILE= %8d \n",NRST);
        fprintf(six,"SPHERICAL CUTOFF RADIUS    = %8.4f ANGSTROM\n",CUTOFF);
        fflush(six);  /* KERR - may not work */
    }

    /* initialization routine; also reads displacements and
       sets up random velocities*/
    if (ProcID == 0) {
        INITIA();
    }

    /* nobody enters MDMAIN() until processor 0 has finished INITIA() */
#if defined(USE_CRL)
    rgn_barrier();
#else
    shm_barrier();
#endif

    if (ProcID != 0) {
        MDMAIN(NFSV,NFRST,NSTEP,NRST,NPRINT,NSAVE,LKT,NORD1,ProcID);
    }
    else {
        if (NSAVE > 0)  /* not true for input decks provided */
            fprintf(six,"COLLECTING X AND V DATA AT EVERY %4d TIME STEPS \n",NSAVE);

        XTT = MDMAIN(NFSV,NFRST,NSTEP,NRST,NPRINT,NSAVE,LKT,NORD1,0); 

        printf("\nExited Happily with XTT = %g (note: XTT value is garbage if NPRINT > NSTEP)\n", XTT);

    }
} /* main.c */


/* Start time stored by timer_clear_and_start() and subtracted from the
 * current time in timer_stop_and_print().  Under ALEWIFE it holds the
 * value returned by get_time(); otherwise it holds seconds computed
 * from gettimeofday() as tv_sec + tv_usec * 1e-6.  It is not declared
 * for builds that define none of the three symbols below.
 */
#if defined(ALEWIFE) || defined(TCPUNIX) || defined(NULLCRL)
static double _timer_start;
#endif

/* Begin a timed interval.
 *
 *   CM5:      clear node timer 0, synchronize with the other nodes,
 *             then start the timer.
 *   ALEWIFE:  reset the CRL profile (USE_CRL builds only), pass a spin
 *             barrier, then record get_time() in _timer_start.
 *   others:   record the wall-clock time in seconds in _timer_start.
 */
void timer_clear_and_start(void)
{
#if defined(CM5)
  CMMD_node_timer_clear(0);
  CMMD_sync_with_nodes();
  CMMD_node_timer_start(0);
#elif defined(ALEWIFE)
#  if defined(USE_CRL)
  crl_prof_reset();
#  endif
  mp_spin_barrier();
  _timer_start = get_time();
#else
  struct timeval now;

  gettimeofday(&now, NULL);
  /* whole seconds plus the microsecond part scaled to seconds */
  _timer_start = now.tv_sec + (now.tv_usec * 1e-6);
#endif
}

/* End the interval begun by timer_clear_and_start() and report it.
 *
 * Each processor measures its own elapsed time (seconds on CM5 and in
 * the gettimeofday() build; scaled by 1e-6 and labelled Mcycles under
 * ALEWIFE).  The elapsed times and their squares are summed with a
 * global add-reduction, from which the mean and standard deviation
 * are derived.  The processor whose id is 0 prints one line:
 *
 *     " <avg> +- <std>   <nprocs> <sum t> <sum t^2>"
 *
 * Under ALEWIFE with USE_CRL the CRL profile is printed afterwards.
 */
void timer_stop_and_print(void)
{
  int    self;
  int    nprocs;
  double t1;
  double t2;
  double avg;
  double var;
  double std;

#if defined(CM5)
  CMMD_node_timer_stop(0);
  t1 = CMMD_node_timer_elapsed(0); /* seconds */
#elif defined(ALEWIFE)
  t1 = get_time();
  t1 = (t1 - _timer_start) * 1e-6; /* Mcycles */
#else
  struct timeval tp;
  gettimeofday(&tp, NULL);
  t1  = tp.tv_sec + (tp.tv_usec * 1e-6);
  t1 -= _timer_start;		/* seconds */
#endif

#if defined(USE_CRL)
  self   = crl_self_addr;
  nprocs = crl_num_nodes;
#else
  self   = MY_PID;
  nprocs = NPROCS; 
#endif

  /* square the local time before t1 is replaced by the global sum */
  t2 = t1 * t1;
#if defined(USE_CRL)
  t1 = rgn_reduce_dadd(t1);
  t2 = rgn_reduce_dadd(t2);
#else
  t1 = shm_reduce_dadd(t1);
  t2 = shm_reduce_dadd(t2);
#endif

  avg = t1 / nprocs;

  /* E[t^2] - E[t]^2 is mathematically non-negative, but when all the
   * processors report nearly the same time, rounding can leave it a
   * hair below zero and sqrt() would then yield NaN.  Clamp it.
   */
  var = (t2 / nprocs) - (avg * avg);
  if (var < 0.0)
  {
    var = 0.0;
  }
  std = sqrt(var);

  if (self == 0)
  {
    printf(" %.3f +- %.3f  ", avg, std);
    /* nprocs is an int, so it is printed with %d */
    printf(" %d %.6f %.6f\n", nprocs, t1, t2);
  }

#if defined(ALEWIFE) && defined(USE_CRL)
  crl_prof_print();
#endif
}


#if !defined(USE_CRL)

/* support for global synchronization under shared memory
 */

/* Number of slots in shm_global_tmp[]; shm_sync_seq_init() asserts that
 * NPROCS is strictly below this value.
 */
#define HackMaxProcs (128)

/* comparison callbacks handed to REDUCE() by shm_reduce_dmin/dmax */
static double _shm_dmin(double, double);
static double _shm_dmax(double, double);

/* One pointer slot per processor.  Used first to hand every processor
 * the reduction tree built by shm_sync_seq_init(), and afterwards as
 * the mailbox written by shm_bcast_send_p() and read by
 * shm_bcast_recv_p().
 * NOTE(review): `shared` is presumably a platform storage qualifier
 * supplied by a project header -- confirm.
 */
shared void      *shm_global_tmp[HackMaxProcs];
/* This processor's handle on the global reduction tree; assigned in
 * shm_sync_par_init() and used by every barrier and reduction below.
 */
static red_tree_p shm_global_sync;

/* Sequential half of the shared-memory synchronization setup.
 *
 * Builds the global reduction tree and stores its address in the first
 * NPROCS slots of shm_global_tmp[], where shm_sync_par_init() later
 * picks it up.  Asserts that it is running on processor 0 and that
 * NPROCS is below HackMaxProcs.
 */
void shm_sync_seq_init(void)
{
  int        self   = MY_PID;
  int        nprocs = NPROCS;
  int        slot;
  red_tree_p tree;

  assert(self == 0);
  assert(nprocs < HackMaxProcs);

  tree = make_global_reduction_tree();

  /* publish the same tree pointer in every processor's slot */
  slot = 0;
  while (slot < nprocs)
  {
    shm_global_tmp[slot] = (void *) tree;
    slot++;
  }
}


/* Parallel half of the shared-memory synchronization setup; called by
 * every processor at the start of main_continuation().
 *
 * Copies the reduction-tree pointer found in this processor's slot of
 * shm_global_tmp[] into shm_global_sync and asserts it is non-NULL.
 */
void shm_sync_par_init(void)
{
  shm_global_sync = (red_tree_p) shm_global_tmp[MY_PID];
  assert(shm_global_sync != NULL);
}


/* Global barrier for the shared-memory build: runs SM_TREE_BARRIER on
 * the global reduction tree with this processor's id.
 */
void shm_barrier(void)
{
  int self = MY_PID;

  SM_TREE_BARRIER(self, shm_global_sync);
}


/* Sending side of a pointer broadcast.
 *
 * Stores val in every processor's slot of shm_global_tmp[] and then
 * passes two tree barriers, matching the two executed around the read
 * in shm_bcast_recv_p().
 *
 * val: pointer value to hand to all other processors.
 */
void shm_bcast_send_p(void *val)
{
  int me    = MY_PID;
  int count = NPROCS;
  int slot  = 0;

  while (slot < count)
  {
    shm_global_tmp[slot] = val;
    slot++;
  }

  /* first barrier: slots are filled; second: receivers have read them
   * (presumably -- relies on SM_TREE_BARRIER being a full barrier) */
  SM_TREE_BARRIER(me, shm_global_sync);
  SM_TREE_BARRIER(me, shm_global_sync);
}


/* Receiving side of a pointer broadcast.
 *
 * Passes one tree barrier, reads this processor's slot of
 * shm_global_tmp[], passes a second tree barrier, and returns the
 * value read.  The two barriers pair with those in shm_bcast_send_p().
 */
void *shm_bcast_recv_p(void)
{
  int   me = MY_PID;
  void *received;

  SM_TREE_BARRIER(me, shm_global_sync);
  received = shm_global_tmp[me];
  SM_TREE_BARRIER(me, shm_global_sync);

  return received;
}


/* Global reduction of val over the reduction tree, combining values
 * with _shm_dmin (the smaller of two doubles).  Returns whatever
 * REDUCE() yields, converted to double.
 */
double shm_reduce_dmin(double val)
{
  return REDUCE(MY_PID, shm_global_sync, _shm_dmin, val);
}


/* Global reduction of val over the reduction tree, combining values
 * with _shm_dmax (the larger of two doubles).  Returns whatever
 * REDUCE() yields, converted to double.
 */
double shm_reduce_dmax(double val)
{
  return REDUCE(MY_PID, shm_global_sync, _shm_dmax, val);
}


/* Global add-reduction of val over the reduction tree.  Returns
 * whatever REDUCE_ADD() yields, converted to double.
 */
double shm_reduce_dadd(double val)
{
  return REDUCE_ADD(MY_PID, shm_global_sync, val);
}


/* Return a when a < b, otherwise b (so b is returned for equal values
 * and whenever the comparison is false).
 */
static double _shm_dmin(double a, double b)
{
  if (a < b)
  {
    return a;
  }
  return b;
}


/* Return a when a > b, otherwise b (so b is returned for equal values
 * and whenever the comparison is false).
 */
static double _shm_dmax(double a, double b)
{
  if (a > b)
  {
    return a;
  }
  return b;
}

#endif
