/*
 * latency.c
 * kirk johnson
 * february 1995
 *
 * Copyright (C) 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies
 * and that both that copyright notice and this permission notice
 * appear in supporting documentation. The author makes no
 * representations about the suitability of this software for any
 * purpose. It is provided "as is" without express or implied
 * warranty.
 *
 * THE AUTHORS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
 * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * RCS $Id: latency.c,v 1.11 1995/08/24 08:24:11 tuna Exp $
 */

#include <stdio.h>

#if defined(CM5)
#include <cm/cmmd.h>
#define assert(x)                                  \
 do {                                              \
  if (!(x))                                        \
    CMMD_error("pn %d: failed assertion, %s:%d\n", \
	       crl_self_addr, __FILE__, __LINE__); \
 } while (0)
#elif defined(ALEWIFE)
#include <parallel.h>
#include <assert.h>
#else
#include <assert.h>
#include <sys/time.h>
#endif

/* TimerStart(x) / TimerStop(x): platform-specific timestamp macros.
 * each one stores a value through the pointer x; the tests in this
 * file pass pointers to double and use (stop value - start value) as
 * the elapsed time.
 */
#if defined(CM5)

/* CM5: node timer 0 is cleared and started, and the start value is
 * recorded as 0
 */
#define TimerStart(x)       \
 do {                       \
  CMMD_node_timer_clear(0); \
  CMMD_node_timer_start(0); \
  *(x) = 0;                 \
 } while (0)

/* CM5: node timer 0 is stopped and its elapsed value, scaled by 1e6,
 * is recorded (presumably seconds converted to microseconds -- confirm
 * against the CMMD documentation)
 */
#define TimerStop(x)                       \
 do {                                      \
  CMMD_node_timer_stop(0);                 \
  *(x) = CMMD_node_timer_elapsed(0) * 1e6; \
 } while (0)

#elif defined(ALEWIFE)

/* Alewife: both macros record the current get_time() value. the
 * expansions are bare assignments (no do/while wrapper), so use them
 * only as complete statements.
 */
#define TimerStart(x)  *(x) = get_time()
#define TimerStop(x)   *(x) = get_time()

#else

/* default: both macros record gettimeofday() as a single count of
 * microseconds (tv_sec * 1e6 + tv_usec); the two definitions are
 * identical
 */
#define TimerStart(x)                    \
 do {                                    \
  struct timeval tp;                     \
  gettimeofday(&tp, NULL);               \
  *(x) = (tp.tv_sec * 1e6) + tp.tv_usec; \
 } while (0)

#define TimerStop(x)                     \
 do {                                    \
  struct timeval tp;                     \
  gettimeofday(&tp, NULL);               \
  *(x) = (tp.tv_sec * 1e6) + tp.tv_usec; \
 } while (0)

#endif

#include "crl.h"
#include "linreg.h"

/* number of regions created by allocate_regions(); also the largest
 * region count handed to any test
 */
#define NumRegions (64)

/* node numbers (compared against crl_self_addr) and their roles:
 *   SelfProc   - runs the timed loops and prints the results
 *   HomeProc   - creates the regions and broadcasts their ids
 *   OtherProcN - take read copies of the regions before the timed
 *                part of the start_read/start_write miss tests
 * NumOtherProcs is the number of OtherProcN entries.
 * NOTE(review): the roles use node numbers 0 through 7, so a full run
 * presumably needs at least 8 nodes -- confirm
 */
#define SelfProc      (0)
#define HomeProc      (1)
#define OtherProc0    (2)
#define OtherProc1    (3)
#define OtherProc2    (4)
#define OtherProc3    (5)
#define OtherProc4    (6)
#define OtherProc5    (7)
#define NumOtherProcs (6)

/* test function signatures; each returns the measured time
 *   RidTestOne - (region ids, number of regions)
 *   RgnTestOne - (mapped regions, number of regions)
 *   RgnTestTwo - (mapped regions, number of other nodes holding read
 *                 copies, number of regions)
 */
typedef double (*RidTestOne)(rid_t *, int);
typedef double (*RgnTestOne)(void **, int);
typedef double (*RgnTestTwo)(void **, int, int);

/* per-node driver and the three harnesses that run a test over a
 * range of region counts and print a linear fit
 */
void   latency_worker(int);
void   rid_test_one(rid_t *, RidTestOne, char *);
void   rgn_test_one(void **, RgnTestOne, char *);
void   rgn_test_two(void **, RgnTestTwo, char *);
/* map/unmap tests (RidTestOne)
 */
double test_map_miss(rid_t *, int);
double test_map_hit_a(rid_t *, int);
double test_map_hit_b(rid_t *, int);
double test_unmap_c(rid_t *, int);
double test_unmap_d(rid_t *, int);
/* start_read/end_read tests (RgnTestTwo for the miss case,
 * RgnTestOne otherwise)
 */
double test_start_read_miss(void **, int, int);
double test_start_read_hit_e(void **, int);
double test_start_read_hit_f(void **, int);
double test_end_read_g(void **, int);
double test_end_read_h(void **, int);
/* start_write/end_write tests (RgnTestTwo for the miss case,
 * RgnTestOne otherwise)
 */
double test_start_write_miss(void **, int, int);
double test_start_write_modify(void **, int);
double test_start_write_hit(void **, int);
double test_end_write(void **, int);
/* creates the regions on HomeProc and distributes their ids
 */
void   allocate_regions(rid_t *, int);


/* program entry point (built as main2 when TCPUNIX is defined)
 *
 * expects exactly one argument, the region size in bytes, which must
 * parse as a positive decimal integer. on bad usage a message is
 * written to stderr (only from node 0 on the CM5) and the process
 * exits with status 1; otherwise latency_worker() is run (through
 * do_in_parallel() on Alewife) and 0 is returned.
 */
#if defined(TCPUNIX)
extern char *GROUP;
int main2(int argc, char *argv[])
#else
int main(int argc, char *argv[])
#endif
{
  int rgnsiz;
  int nparsed;

#if defined(CM5)
  CMMD_set_io_mode(0, CMMD_independent);
  CMMD_set_io_mode(1, CMMD_independent);
  CMMD_set_io_mode(2, CMMD_independent);
#endif

  if (argc != 2)
  {
#if defined(CM5)
    if (CMMD_self_address() == 0)
      fprintf(stderr, "usage: latency <rgnsiz>\n");
#else
    fprintf(stderr, "usage: latency <rgnsiz>\n");
#endif
    exit(1);
  }

  /* sscanf() leaves rgnsiz untouched when argv[1] does not start with
   * a number, so a failed conversion is rejected explicitly instead
   * of testing an uninitialized value
   */
  nparsed = sscanf(argv[1], "%d", &rgnsiz);
  if ((nparsed != 1) || (rgnsiz <= 0))
  {
#if defined(CM5)
    if (CMMD_self_address() == 0)
      fprintf(stderr, "rgnsiz must be positive\n");
#else
    fprintf(stderr, "rgnsiz must be positive\n");
#endif
    exit(1);
  }

#if defined(ALEWIFE)
  do_in_parallel(latency_worker, rgnsiz);
#else
  latency_worker(rgnsiz);
#endif

  return 0;
}


void latency_worker(int region_size)
{
  int    i;
  rid_t *rids;
  void **rgns;

#if defined(TCPUNIX)
  crl_init(GROUP);
#else
  crl_init();
#endif

  if (crl_self_addr == 0)
  {
    printf("region size is %d bytes\n", region_size);
#if defined(ALEWIFE)
    printf(" (times in cycles)\n\n");
#else
    printf(" (times in microseconds)\n\n");
#endif
  }

  rids = (rid_t *) safe_malloc(sizeof(rid_t) * NumRegions);
  rgns = (void **) safe_malloc(sizeof(void *) * NumRegions);
  assert(rids != NULL);
  assert(rgns != NULL);

  allocate_regions(rids, region_size);

  rid_test_one(rids, test_map_miss, "map (miss)");
  rid_test_one(rids, test_map_hit_a, "map (hit) [a]");
  rid_test_one(rids, test_map_hit_b, "map (hit) [b]");
  rid_test_one(rids, test_unmap_c, "unmap [c]");
  rid_test_one(rids, test_unmap_d, "unmap [d]");

  if (crl_self_addr == 0)
  {
    printf("\n");
    printf(" [a] mapping a region that is not currently mapped but is\n");
    printf("     present in the unmapped region cache\n");
    printf(" [b] mapping a region that is already mapped\n");
    printf(" [c] unmapping a region that is mapped multiple times\n");
    printf(" [d] unmapping a region that is mapped once\n");
    printf("\n");
  }

  for (i=0; i<NumRegions; i++)
    rgns[i] = rgn_map(rids[i]);

  rgn_test_two(rgns, test_start_read_miss, "start_read (miss)");
  rgn_test_one(rgns, test_start_read_hit_e, "start_read (hit) [e]");
  rgn_test_one(rgns, test_start_read_hit_f, "start_read (hit) [f]");
  rgn_test_one(rgns, test_end_read_g, "end_read [g]");
  rgn_test_one(rgns, test_end_read_h, "end_read [h]");

  if (crl_self_addr == 0)
  {
    printf("\n");
    printf(" [e] starting a read that can be satisfied locally; no\n");
    printf("     other reads already in progress\n");
    printf(" [f] starting a read that can be satisfied locally; one\n");
    printf("     other read already in progress\n");
    printf(" [g] ending a read leaving other reads in progress\n");
    printf(" [h] ending a read leaving no other reads in progress\n");
    printf("\n");
  }

  rgn_test_two(rgns, test_start_write_miss, "start_write (miss)");
  rgn_test_one(rgns, test_start_write_modify, "start_write (modify)");
  rgn_test_one(rgns, test_start_write_hit, "start_write (hit)");
  rgn_test_one(rgns, test_end_write, "end_write");

  for (i=0; i<NumRegions; i++)
    rgn_unmap(rgns[i]);

  safe_free(rids);
  safe_free(rgns);
}


/* run one region-id test for every region count from 1 to NumRegions
 * and, on SelfProc, print a linear fit of time against region count
 *
 * rids  - region ids handed to func
 * func  - test to run; returns the measured time for the given count
 * label - text printed at the start of the output line
 */
void rid_test_one(rid_t *rids, RidTestOne func, char *label)
{
  double *counts;
  double *times;
  LinReg  fit;
  int     k;

  counts = (double *) safe_malloc(sizeof(double) * NumRegions);
  times  = (double *) safe_malloc(sizeof(double) * NumRegions);
  assert(counts != NULL);
  assert(times != NULL);

  /* one unrecorded run over all regions before sampling
   */
  func(rids, NumRegions);

  /* sample k holds the result for k+1 regions
   */
  for (k=0; k<NumRegions; k++)
  {
    counts[k] = k + 1;
    times[k]  = func(rids, k + 1);
  }

  if (crl_self_addr == SelfProc)
  {
    printf("%-22s ", label);

    /* columns: intercept, slope, fraction of variance explained
     */
    linear_regression(NumRegions, counts, times, &fit);
    printf(" %8.2f +- %6.2f "
           " %8.2f +- %6.2f "
           " %.6f\n",
           fit.b0_val, fit.b0_std,
           fit.b1_val, fit.b1_std,
           (fit.ssr / fit.sst));
  }

  safe_free(counts);
  safe_free(times);
}


/* run one mapped-region test for every region count from 1 to
 * NumRegions and, on SelfProc, print a linear fit of time against
 * region count
 *
 * rgns  - mapped regions handed to func
 * func  - test to run; returns the measured time for the given count
 * label - text printed at the start of the output line
 */
void rgn_test_one(void **rgns, RgnTestOne func, char *label)
{
  double *counts;
  double *times;
  LinReg  fit;
  int     k;

  counts = (double *) safe_malloc(sizeof(double) * NumRegions);
  times  = (double *) safe_malloc(sizeof(double) * NumRegions);
  assert(counts != NULL);
  assert(times != NULL);

  /* one unrecorded run over all regions before sampling
   */
  func(rgns, NumRegions);

  /* sample k holds the result for k+1 regions
   */
  for (k=0; k<NumRegions; k++)
  {
    counts[k] = k + 1;
    times[k]  = func(rgns, k + 1);
  }

  if (crl_self_addr == SelfProc)
  {
    printf("%-22s ", label);

    /* columns: intercept, slope, fraction of variance explained
     */
    linear_regression(NumRegions, counts, times, &fit);
    printf(" %8.2f +- %6.2f "
           " %8.2f +- %6.2f "
           " %.6f\n",
           fit.b0_val, fit.b0_std,
           fit.b1_val, fit.b1_std,
           (fit.ssr / fit.sst));
  }

  safe_free(counts);
  safe_free(times);
}


/* run one two-parameter mapped-region test: for every number of
 * other nodes from 0 to NumOtherProcs, run the test for every region
 * count from 1 to NumRegions and, on SelfProc, print one line with a
 * linear fit of time against region count
 *
 * rgns  - mapped regions handed to func
 * func  - test to run; returns the measured time for the given
 *         (other-node count, region count) pair
 * label - text printed at the start of each output line
 */
void rgn_test_two(void **rgns, RgnTestTwo func, char *label)
{
  double *counts;
  double *times;
  LinReg  fit;
  int     nothers;
  int     k;

  counts = (double *) safe_malloc(sizeof(double) * NumRegions);
  times  = (double *) safe_malloc(sizeof(double) * NumRegions);
  assert(counts != NULL);
  assert(times != NULL);

  /* one unrecorded run at the largest setting before sampling
   */
  func(rgns, NumOtherProcs, NumRegions);

  for (nothers=0; nothers<=NumOtherProcs; nothers++)
  {
    /* sample k holds the result for k+1 regions
     */
    for (k=0; k<NumRegions; k++)
    {
      counts[k] = k + 1;
      times[k]  = func(rgns, nothers, k + 1);
    }

    if (crl_self_addr != SelfProc)
      continue;

    printf("%-20s %d ", label, nothers);

    /* columns: intercept, slope, fraction of variance explained
     */
    linear_regression(NumRegions, counts, times, &fit);
    printf(" %8.2f +- %6.2f "
           " %8.2f +- %6.2f "
           " %.6f\n",
           fit.b0_val, fit.b0_std,
           fit.b1_val, fit.b1_std,
           (fit.ssr / fit.sst));
  }

  safe_free(counts);
  safe_free(times);
}


/* measure time to map regions that are unmapped and not cached in the URC
 *
 * rids     - region ids; the first nregions entries are mapped
 * nregions - number of regions to map
 *
 * returns the time taken by the nregions rgn_map() calls on SelfProc
 * and 0 on every other node. every node executes crl_flush_urc() and
 * both rgn_barrier() calls, so presumably all nodes must call this
 * together.
 */
double test_map_miss(rid_t *rids, int nregions)
{
  int    i;
  void **rgns;
  double t0, t1;
  double rslt;

  crl_flush_urc();
  rgn_barrier();

  if (crl_self_addr == SelfProc)
  {
    rgns = (void **) safe_malloc(sizeof(void *) * nregions);
    assert(rgns != NULL);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);

    /* the scratch array is only needed within this call
     */
    safe_free(rgns);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to map regions that are unmapped and cached in the URC
 *
 * rids     - region ids; the first nregions entries are mapped
 * nregions - number of regions to map
 *
 * on SelfProc the regions are mapped and unmapped once without
 * timing (leaving them unmapped, presumably in the URC), then the
 * second round of rgn_map() calls is timed.
 *
 * returns the time taken by the timed rgn_map() calls on SelfProc
 * and 0 on every other node. every node executes crl_flush_urc() and
 * both rgn_barrier() calls, so presumably all nodes must call this
 * together.
 */
double test_map_hit_a(rid_t *rids, int nregions)
{
  int    i;
  void **rgns;
  double t0, t1;
  double rslt;

  crl_flush_urc();
  rgn_barrier();

  if (crl_self_addr == SelfProc)
  {
    rgns = (void **) safe_malloc(sizeof(void *) * nregions);
    assert(rgns != NULL);

    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);

    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);

    /* the scratch array is only needed within this call
     */
    safe_free(rgns);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to map regions that are already mapped
 *
 * rids     - region ids; the first nregions entries are mapped
 * nregions - number of regions to map
 *
 * on SelfProc each region is mapped once without timing, then a
 * second rgn_map() of every region is timed; afterwards each region
 * is unmapped twice to balance the two maps.
 *
 * returns the time taken by the timed rgn_map() calls on SelfProc
 * and 0 on every other node. every node executes crl_flush_urc() and
 * both rgn_barrier() calls, so presumably all nodes must call this
 * together.
 */
double test_map_hit_b(rid_t *rids, int nregions)
{
  int    i;
  void **rgns;
  double t0, t1;
  double rslt;

  crl_flush_urc();
  rgn_barrier();

  if (crl_self_addr == SelfProc)
  {
    rgns = (void **) safe_malloc(sizeof(void *) * nregions);
    assert(rgns != NULL);

    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
    {
      rgn_unmap(rgns[i]);
      rgn_unmap(rgns[i]);
    }

    /* the scratch array is only needed within this call
     */
    safe_free(rgns);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to unmap region (leaving it mapped)
 *
 * rids     - region ids; the first nregions entries are used
 * nregions - number of regions to unmap
 *
 * on SelfProc each region is mapped twice without timing, then one
 * rgn_unmap() per region is timed; a second, untimed rgn_unmap() per
 * region balances the remaining map.
 *
 * returns the time taken by the timed rgn_unmap() calls on SelfProc
 * and 0 on every other node. every node executes crl_flush_urc() and
 * both rgn_barrier() calls, so presumably all nodes must call this
 * together.
 */
double test_unmap_c(rid_t *rids, int nregions)
{
  int    i;
  void **rgns;
  double t0, t1;
  double rslt;

  crl_flush_urc();
  rgn_barrier();

  if (crl_self_addr == SelfProc)
  {
    rgns = (void **) safe_malloc(sizeof(void *) * nregions);
    assert(rgns != NULL);

    for (i=0; i<nregions; i++)
    {
      rgns[i] = rgn_map(rids[i]);
      rgn_map(rids[i]);
    }

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);

    /* the scratch array is only needed within this call
     */
    safe_free(rgns);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to unmap region (and move it into the URC)
 *
 * rids     - region ids; the first nregions entries are used
 * nregions - number of regions to unmap
 *
 * on SelfProc each region is mapped once without timing, then the
 * single rgn_unmap() per region is timed.
 *
 * returns the time taken by the timed rgn_unmap() calls on SelfProc
 * and 0 on every other node. every node executes crl_flush_urc() and
 * both rgn_barrier() calls, so presumably all nodes must call this
 * together.
 */
double test_unmap_d(rid_t *rids, int nregions)
{
  int    i;
  void **rgns;
  double t0, t1;
  double rslt;

  crl_flush_urc();
  rgn_barrier();

  if (crl_self_addr == SelfProc)
  {
    rgns = (void **) safe_malloc(sizeof(void *) * nregions);
    assert(rgns != NULL);

    for (i=0; i<nregions; i++)
      rgns[i] = rgn_map(rids[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_unmap(rgns[i]);
    TimerStop(&t1);

    /* the scratch array is only needed within this call
     */
    safe_free(rgns);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to start_read (miss)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nothers  - how many of the OtherProcN nodes take a read copy of
 *            each region before the timed part
 * nregions - number of regions to read
 *
 * returns the time taken by the rgn_start_read() calls on SelfProc
 * and 0 on every other node. the phases are separated by
 * rgn_barrier() calls that every node executes.
 */
double test_start_read_miss(void **rgns, int nothers, int nregions)
{
  const int me = crl_self_addr;
  double    started, stopped;
  double    elapsed = 0;
  int       is_reader;
  int       k;

  rgn_barrier();

  /* phase 1: the home node writes every region, flushing the
   * regions back to it
   */
  if (me == HomeProc)
  {
    for (k=0; k<nregions; k++)
    {
      rgn_start_write(rgns[k]);
      rgn_end_write(rgns[k]);
    }
  }

  rgn_barrier();

  /* phase 2: the first nothers of the other nodes fetch read copies
   */
  switch (me)
  {
    case OtherProc0: is_reader = (nothers > 0); break;
    case OtherProc1: is_reader = (nothers > 1); break;
    case OtherProc2: is_reader = (nothers > 2); break;
    case OtherProc3: is_reader = (nothers > 3); break;
    case OtherProc4: is_reader = (nothers > 4); break;
    case OtherProc5: is_reader = (nothers > 5); break;
    default:         is_reader = 0;             break;
  }

  if (is_reader)
  {
    for (k=0; k<nregions; k++)
    {
      rgn_start_read(rgns[k]);
      rgn_end_read(rgns[k]);
    }
  }

  rgn_barrier();

  /* phase 3: timed reads on the measuring node
   */
  if (me == SelfProc)
  {
    TimerStart(&started);
    for (k=0; k<nregions; k++)
      rgn_start_read(rgns[k]);
    TimerStop(&stopped);

    for (k=0; k<nregions; k++)
      rgn_end_read(rgns[k]);

    elapsed = stopped - started;
  }

  rgn_barrier();

  return elapsed;
}


/* measure time to start_read (hit, no reads in progress)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to read
 *
 * on SelfProc every region is first read and released without
 * timing (presumably so the timed reads can be satisfied locally),
 * then one rgn_start_read() per region is timed and the reads are
 * ended.
 *
 * returns the time taken by the timed rgn_start_read() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_start_read_hit_e(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
      rgn_start_read(rgns[i]);

    for (i=0; i<nregions; i++)
      rgn_end_read(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_start_read(rgns[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_end_read(rgns[i]);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to start_read (hit, one read in progress)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to read
 *
 * on SelfProc a read is started on every region without timing and
 * left in progress, then a second rgn_start_read() per region is
 * timed; afterwards each region gets two rgn_end_read() calls to
 * balance the two reads.
 *
 * returns the time taken by the timed rgn_start_read() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_start_read_hit_f(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
      rgn_start_read(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_start_read(rgns[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
    {
      rgn_end_read(rgns[i]);
      rgn_end_read(rgns[i]);
    }

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to end_read (leaving one read in progress)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to use
 *
 * on SelfProc two reads are started on every region without timing,
 * then one rgn_end_read() per region is timed; a second, untimed
 * rgn_end_read() per region ends the remaining read.
 *
 * returns the time taken by the timed rgn_end_read() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_end_read_g(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
    {
      rgn_start_read(rgns[i]);
      rgn_start_read(rgns[i]);
    }

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_end_read(rgns[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_end_read(rgns[i]);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to end_read (leaving no reads in progress)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to use
 *
 * on SelfProc one read is started on every region without timing,
 * then the matching rgn_end_read() per region is timed.
 *
 * returns the time taken by the timed rgn_end_read() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_end_read_h(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
      rgn_start_read(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_end_read(rgns[i]);
    TimerStop(&t1);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to start_write (miss)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nothers  - how many of the OtherProcN nodes take a read copy of
 *            each region before the timed part
 * nregions - number of regions to write
 *
 * returns the time taken by the rgn_start_write() calls on SelfProc
 * and 0 on every other node. the phases are separated by
 * rgn_barrier() calls that every node executes.
 */
double test_start_write_miss(void **rgns, int nothers, int nregions)
{
  const int me = crl_self_addr;
  double    started, stopped;
  double    elapsed = 0;
  int       is_reader;
  int       k;

  rgn_barrier();

  /* phase 1: the home node writes every region, flushing the
   * regions back to it
   */
  if (me == HomeProc)
  {
    for (k=0; k<nregions; k++)
    {
      rgn_start_write(rgns[k]);
      rgn_end_write(rgns[k]);
    }
  }

  rgn_barrier();

  /* phase 2: the first nothers of the other nodes fetch read copies
   */
  switch (me)
  {
    case OtherProc0: is_reader = (nothers > 0); break;
    case OtherProc1: is_reader = (nothers > 1); break;
    case OtherProc2: is_reader = (nothers > 2); break;
    case OtherProc3: is_reader = (nothers > 3); break;
    case OtherProc4: is_reader = (nothers > 4); break;
    case OtherProc5: is_reader = (nothers > 5); break;
    default:         is_reader = 0;             break;
  }

  if (is_reader)
  {
    for (k=0; k<nregions; k++)
    {
      rgn_start_read(rgns[k]);
      rgn_end_read(rgns[k]);
    }
  }

  rgn_barrier();

  /* phase 3: timed writes on the measuring node
   */
  if (me == SelfProc)
  {
    TimerStart(&started);
    for (k=0; k<nregions; k++)
      rgn_start_write(rgns[k]);
    TimerStop(&stopped);

    for (k=0; k<nregions; k++)
      rgn_end_write(rgns[k]);

    elapsed = stopped - started;
  }

  rgn_barrier();

  return elapsed;
}


/* measure time to start_write (modify)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to write
 *
 * the home node first writes every region; SelfProc then reads and
 * releases every region without timing, and finally one
 * rgn_start_write() per region is timed.
 *
 * returns the time taken by the timed rgn_start_write() calls on
 * SelfProc and 0 on every other node. the phases are separated by
 * rgn_barrier() calls that every node executes.
 */
double test_start_write_modify(void **rgns, int nregions)
{
  const int me = crl_self_addr;
  double    started, stopped;
  double    elapsed = 0;
  int       k;

  rgn_barrier();

  /* phase 1: the home node writes every region, flushing the
   * regions back to it
   */
  if (me == HomeProc)
  {
    for (k=0; k<nregions; k++)
    {
      rgn_start_write(rgns[k]);
      rgn_end_write(rgns[k]);
    }
  }

  rgn_barrier();

  /* phase 2: untimed read pass, then timed writes, on the measuring
   * node
   */
  if (me == SelfProc)
  {
    for (k=0; k<nregions; k++)
      rgn_start_read(rgns[k]);

    for (k=0; k<nregions; k++)
      rgn_end_read(rgns[k]);

    TimerStart(&started);
    for (k=0; k<nregions; k++)
      rgn_start_write(rgns[k]);
    TimerStop(&stopped);

    for (k=0; k<nregions; k++)
      rgn_end_write(rgns[k]);

    elapsed = stopped - started;
  }

  rgn_barrier();

  return elapsed;
}


/* measure time to start_write (hit)
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to write
 *
 * on SelfProc every region is first written and released without
 * timing, then one rgn_start_write() per region is timed and the
 * writes are ended.
 *
 * returns the time taken by the timed rgn_start_write() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_start_write_hit(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
      rgn_start_write(rgns[i]);

    for (i=0; i<nregions; i++)
      rgn_end_write(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_start_write(rgns[i]);
    TimerStop(&t1);

    for (i=0; i<nregions; i++)
      rgn_end_write(rgns[i]);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* measure time to end_write
 *
 * rgns     - mapped regions; the first nregions entries are used
 * nregions - number of regions to use
 *
 * on SelfProc a write is started on every region without timing,
 * then the matching rgn_end_write() per region is timed.
 *
 * returns the time taken by the timed rgn_end_write() calls on
 * SelfProc and 0 on every other node. every node executes both
 * rgn_barrier() calls.
 */
double test_end_write(void **rgns, int nregions)
{
  int    i;
  int    self;
  double t0, t1;
  double rslt;

  self = crl_self_addr;

  rgn_barrier();

  /* run tests
   */
  if (self == SelfProc)
  {
    for (i=0; i<nregions; i++)
      rgn_start_write(rgns[i]);

    TimerStart(&t0);
    for (i=0; i<nregions; i++)
      rgn_end_write(rgns[i]);
    TimerStop(&t1);

    rslt = t1 - t0;
  }
  else
  {
    rslt = 0;
  }

  rgn_barrier();

  return rslt;
}


/* fill rids with the ids of NumRegions regions
 *
 * HomeProc creates the regions and sends the id array with
 * rgn_bcast_send(); every other node obtains the same array with
 * rgn_bcast_recv().
 *
 * rids        - array with room for NumRegions region ids
 * region_size - size passed to rgn_create() for each region
 */
void allocate_regions(rid_t *rids, int region_size)
{
  const int id_bytes = sizeof(rid_t) * NumRegions;
  int       k;

  /* receivers have nothing to create
   */
  if (crl_self_addr != HomeProc)
  {
    rgn_bcast_recv(id_bytes, rids);
    return;
  }

  for (k=0; k<NumRegions; k++)
    rids[k] = rgn_create(region_size);

  rgn_bcast_send(id_bytes, rids);
}
