/*
 * alw_sync.c
 * kirk johnson
 * january 1995
 *
 * Copyright (C) 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies
 * and that both that copyright notice and this permission notice
 * appear in supporting documentation. The author makes no
 * representations about the suitability of this software for any
 * purpose. It is provided "as is" without express or implied
 * warranty.
 *
 * THE AUTHORS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
 * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * RCS $Id: alw_sync.c,v 1.4 1995/08/22 21:27:49 tuna Exp $
 */

#include <primops.h>
#include "crl_int.h"

/* size in bytes of the broadcast staging buffer allocated in
 * init_sync(); rgn_bcast_send() asserts that a payload fits in it
 */
#define BcastBufSize (1024)
/* number of per-stage slots kept for each parity in BarrierState and
 * ReductionState. the barrier and reduction loops consume one slot
 * each time the partner distance doubles, so the arrays only cover
 * NPROCS up to 2^Log2MaxProcs
 */
#define Log2MaxProcs    (8)

/* local bookkeeping for rgn_barrier().
 *
 * parity: selects which of the two flag rows the next barrier uses;
 *         rgn_barrier() toggles it on every call
 * flags:  flags[p][s] is set to 1 by _barrier_handler() when the
 *         message for stage s of a parity-p barrier arrives, and is
 *         reset to 0 by rgn_barrier() once it has seen it
 */
typedef struct
{
  unsigned parity;
  char     flags[2][Log2MaxProcs];
} BarrierState;

/* local bookkeeping for rgn_bcast_send() / rgn_bcast_recv().
 *
 * flag: set to 1 by _broadcast_handler() when a broadcast message has
 *       been received; rgn_bcast_recv() spins on it and resets it to 0
 * dist: distance value carried by the received message; rgn_bcast_recv()
 *       uses it as the starting distance when forwarding the data
 * buf:  staging buffer of BcastBufSize bytes, allocated in init_sync()
 */
typedef struct
{
  unsigned flag;
  unsigned dist;
  void    *buf;
} BroadcastState;

/* local bookkeeping for the rgn_reduce_d*() functions.
 *
 * parity: selects which row of flags/vals the next reduction uses;
 *         toggled on every reduction call
 * flags:  flags[p][s] is set to 1 by _reduction_handler() when the
 *         message for stage s of a parity-p reduction arrives, and is
 *         reset to 0 by the reduction loop once it has seen it
 * vals:   vals[p][s] holds the double delivered by that same message
 */
typedef struct
{
  unsigned parity;
  char     flags[2][Log2MaxProcs];
  double   vals[2][Log2MaxProcs];
} ReductionState;

/* message handlers (defined later in this file); each is named as the
 * handler argument of the user_do_on*() calls made by the matching
 * rgn_* routine
 */
static void _barrier_handler(void);
static void _broadcast_handler(void);
static void _reduction_handler(void);

/* file-local state shared between the rgn_* entry points and their
 * handlers; all three are zeroed in init_sync()
 */
static BarrierState   barrier_state;
static BroadcastState broadcast_state;
static ReductionState reduction_state;


/* init_sync: clear the barrier, reduction, and broadcast state,
 * allocate the BcastBufSize-byte broadcast staging buffer, and then
 * wait in mp_spin_barrier() before returning.
 */
void init_sync(void)
{
  /* start every collective from an all-zero state */
  bzero((char *) &barrier_state,   sizeof(barrier_state));
  bzero((char *) &reduction_state, sizeof(reduction_state));
  bzero((char *) &broadcast_state, sizeof(broadcast_state));

  /* staging buffer used by rgn_bcast_send / rgn_bcast_recv */
  broadcast_state.buf = (void *) malloc(BcastBufSize);
  assert(broadcast_state.buf != NULL);

  /* rgn_barrier is deliberately NOT used for this rendezvous. it is
   * built on the state cleared above, so a barrier message could
   * arrive before the local state has been zeroed, set a flag to 1,
   * and then have that flag wiped out by the zeroing. (the
   * alternative would be to disable message delivery while the
   * global states are being cleared.)
   */
  mp_spin_barrier();
}


/* rgn_barrier: message-based barrier.
 *
 * for each distance 1, 2, 4, ... below NPROCS, send a _barrier_handler
 * message to node (MY_PID ^ distance) and then spin until the flag for
 * that stage has been set locally (by _barrier_handler running on
 * behalf of an incoming message). two flag rows are used alternately,
 * selected by a parity bit that flips on every call.
 */
void rgn_barrier(void)
{
  BarrierState *bs = &barrier_state;
  unsigned      which;
  unsigned      me;
  unsigned      count;
  unsigned      round;
  unsigned      hop;
  char         *row;

  /* use the row named by the stored parity, and flip the stored
   * parity so that the next barrier uses the other row
   */
  which      = (bs->parity != 0) ? 1 : 0;
  bs->parity = which ^ 1;

  me    = MY_PID;
  count = NPROCS;
  row   = bs->flags[which];

  for (round = 0, hop = 1; hop < count; round += 1, hop <<= 1)
  {
    /* notify this round's partner ... */
    user_do_on((me ^ hop), _barrier_handler, which, round);

    /* ... then wait for this round's notification to land here, and
     * re-arm the slot for the next barrier that uses this row
     */
    while (((volatile char *) row)[round] == 0)
      ;
    row[round] = 0;
  }
}


/* _barrier_handler: handler named in the user_do_on() call made by
 * rgn_barrier(). records the arrival of one barrier message by setting
 * the flag for the (parity, stage) pair read from the message.
 */
static void _barrier_handler(void)
{
  unsigned parity;
  unsigned stage;
  char    *flags;

  /* extract scalar args from head of message */
  /* (presumably registers 2 and 3 hold the parity and stage arguments
   * that rgn_barrier() passed to user_do_on -- confirm against the
   * user_do_on / ipi_in_reg definitions)
   */
  parity = (int) ipi_in_reg(2);
  stage  = (int) ipi_in_reg(3);
  /* NOTE(review): presumably disposes of the incoming message now that
   * its arguments have been read -- confirm against ipicst
   */
  ipicst(-1, -1);

  flags = barrier_state.flags[parity];
  /* rgn_barrier() resets each slot to 0 after observing it, so the
   * slot is expected to be clear here
   */
  sanity(flags[stage] == 0);
  flags[stage] = 1;
}


/* rgn_bcast_send: originate a broadcast of nbytes bytes from buf.
 *
 * the payload is copied into the local staging buffer, flushed, and
 * then sent (with _broadcast_handler as the handler) to the nodes
 * MY_PID ^ d for d = NPROCS/2, NPROCS/4, ..., 1; each message carries
 * the next smaller distance so the receiver can continue forwarding.
 * finishes by entering rgn_barrier().
 *
 * nbytes must not exceed BcastBufSize (asserted).
 */
void rgn_bcast_send(int nbytes, void *buf)
{
  int   off;
  int   me;
  int   span;
  int   target;
  int   ndwords;
  void *staging;

  /* no support (yet) for payloads larger than the staging buffer
   */
  assert(nbytes <= BcastBufSize);

  me      = MY_PID;
  span    = NPROCS >> 1;
  ndwords = (nbytes + 7) >> 3;     /* nbytes rounded up to 8-byte units */
  staging = broadcast_state.buf;

  bcopy(buf, staging, nbytes);

  /* flush the staged payload from the memory system: walk down from
   * the last 8-byte unit in 16-byte steps, then flush the start of
   * the buffer
   */
  off = (ndwords << 3) - 8;
  while (off > 0)
  {
    hardflush2(staging, off);
    off -= 16;
  }
  hardflush(staging);

  /* send to each partner, halving the distance before each send so
   * that the message carries the distance the receiver should start
   * forwarding at
   */
  while (span != 0)
  {
    target = me ^ span;
    span >>= 1;
    user_do_on_dmanofix(target, _broadcast_handler,
                        span, ndwords,
                        staging, ndwords);
  }

  rgn_barrier();
}


void rgn_bcast_recv(int nbytes, void *buf)
{
  int             self;
  int             dist;
  int             peer;
  int             ndwords;
  BroadcastState *state;
  void           *src_buf;

  self    = MY_PID;
  ndwords = (nbytes + 7) >> 3;
  state   = &broadcast_state;
  src_buf = state->buf;

  while (*((volatile unsigned *) &(state->flag)) == 0);
  dist = state->dist;

  /* wait until data storeback is done
   */
  wait_for_storeback();

  while (dist != 0)
  {
    peer    = self ^ dist;
    dist    = dist >> 1;
    user_do_on_dmanofix(peer, _broadcast_handler,
			dist, ndwords,
			src_buf, ndwords);
  }

  bcopy(src_buf, buf, nbytes);
  state->flag = 0;

  rgn_barrier();
}


/* _broadcast_handler: handler named in the user_do_on_dmanofix() calls
 * made by rgn_bcast_send() and rgn_bcast_recv(). flushes the local
 * staging buffer, directs the storeback at it, and then records the
 * forwarding distance and raises broadcast_state.flag, which
 * rgn_bcast_recv() spins on.
 */
static void _broadcast_handler(void)
{
  int             i;
  int             dist;
  int             ndwords;
  BroadcastState *state;
  void           *dst_buf;

  /* extract scalar args from head of message
   */
  /* (presumably registers 2 and 3 hold the dist and ndwords arguments
   * that the sender passed to user_do_on_dmanofix -- confirm)
   */
  dist    = (int) ipi_in_reg(2);
  ndwords = (int) ipi_in_reg(3);

  state   = &broadcast_state;
  dst_buf = state->buf;

  /* flush recv buffer from memory system
   */
  /* (same pattern rgn_bcast_send() uses on its send buffer: walk down
   * from the last 8-byte unit in 16-byte steps, then flush the start)
   */
  for (i=((ndwords<<3)-8); i>0; i-=16)
    hardflush2(dst_buf, i);
  hardflush(dst_buf);

  /* initiate data storeback
   */
  /* NOTE(review): presumably StoreAddr is the destination address for
   * the message payload and ipicstnofix(-1, 2) starts the transfer
   * past the two scalar words read above -- confirm against the CReg
   * and ipicstnofix definitions
   */
  CReg->StoreAddr = (unsigned) dst_buf;
  ipicstnofix(-1, 2);

  /* rgn_bcast_recv() resets the flag to 0 after consuming a broadcast,
   * so it is expected to be clear here
   */
  sanity(state->flag == 0);
  /* NOTE(review): flag is raised before dist is stored; this relies on
   * rgn_bcast_recv() not observing the flag until this handler has
   * finished -- confirm handler execution is atomic w.r.t. the
   * receiving code
   */
  state->flag = 1;
  state->dist = dist;
}


/* rgn_reduce_dadd: sum reduction on doubles.
 *
 * for each distance 1, 2, 4, ... below NPROCS, send the current
 * partial sum (as two unsigned words) to node (MY_PID ^ distance) via
 * _reduction_handler, spin until the value for that stage has been
 * delivered locally, and add it to the partial sum. two rows of
 * flags/values are used alternately, selected by a parity bit that
 * flips on every call.
 *
 * x:       this node's contribution
 * returns: the partial sum after the last stage
 *
 * NOTE(review): presumably called collectively by every node, with
 * NPROCS a power of two no larger than 2^Log2MaxProcs -- confirm.
 */
double rgn_reduce_dadd(double x)
{
  /* the message carries the double as two unsigned words. go through
   * a union to split it; casting &x to (unsigned *) and reading
   * through that pointer violates the C aliasing rules
   */
  union { double d; unsigned w[2]; } bits;
  unsigned        w0;
  unsigned        w1;
  unsigned        parity;
  unsigned        self;
  unsigned        nprocs;
  unsigned        stage;
  unsigned        dist;
  char           *flags;
  double         *vals;
  ReductionState *state;

  state = &reduction_state;

  /* use the row named by the stored parity, and flip the stored
   * parity so that the next reduction uses the other row
   */
  if (state->parity)
  {
    parity        = 1;
    state->parity = 0;
  }
  else
  {
    parity        = 0;
    state->parity = 1;
  }

  self   = MY_PID;
  nprocs = NPROCS;
  stage  = 0;
  dist   = 1;
  flags  = state->flags[parity];
  vals   = state->vals[parity];

  while (dist < nprocs)
  {
    /* split the current partial result into its two words */
    bits.d = x;
    w0     = bits.w[0];
    w1     = bits.w[1];

    user_do_on((self^dist), _reduction_handler, parity, stage, w0, w1);
    stage += 1;
    dist  += dist;

    /* wait for this stage's value to arrive, then re-arm the slot */
    while (*((volatile char *) flags) == 0);
    *flags = 0;

    x += *vals;

    flags += 1;
    vals  += 1;
  }

  return x;
}


/* rgn_reduce_dmin: minimum reduction on doubles.
 *
 * for each distance 1, 2, 4, ... below NPROCS, send the current
 * partial minimum (as two unsigned words) to node (MY_PID ^ distance)
 * via _reduction_handler, spin until the value for that stage has
 * been delivered locally, and keep the smaller of the two. two rows
 * of flags/values are used alternately, selected by a parity bit that
 * flips on every call.
 *
 * x:       this node's contribution
 * returns: the partial minimum after the last stage
 *
 * NOTE(review): presumably called collectively by every node, with
 * NPROCS a power of two no larger than 2^Log2MaxProcs -- confirm.
 */
double rgn_reduce_dmin(double x)
{
  /* the message carries the double as two unsigned words. go through
   * a union to split it; casting &x to (unsigned *) and reading
   * through that pointer violates the C aliasing rules
   */
  union { double d; unsigned w[2]; } bits;
  unsigned        w0;
  unsigned        w1;
  unsigned        parity;
  unsigned        self;
  unsigned        nprocs;
  unsigned        stage;
  unsigned        dist;
  char           *flags;
  double         *vals;
  double          tmp;
  ReductionState *state;

  state = &reduction_state;

  /* use the row named by the stored parity, and flip the stored
   * parity so that the next reduction uses the other row
   */
  if (state->parity)
  {
    parity        = 1;
    state->parity = 0;
  }
  else
  {
    parity        = 0;
    state->parity = 1;
  }

  self   = MY_PID;
  nprocs = NPROCS;
  stage  = 0;
  dist   = 1;
  flags  = state->flags[parity];
  vals   = state->vals[parity];

  while (dist < nprocs)
  {
    /* split the current partial result into its two words */
    bits.d = x;
    w0     = bits.w[0];
    w1     = bits.w[1];

    user_do_on((self^dist), _reduction_handler, parity, stage, w0, w1);
    stage += 1;
    dist  += dist;

    /* wait for this stage's value to arrive, then re-arm the slot */
    while (*((volatile char *) flags) == 0);
    *flags = 0;

    tmp = *vals;
    if (tmp < x) x = tmp;

    flags += 1;
    vals  += 1;
  }

  return x;
}


/* rgn_reduce_dmax: maximum reduction on doubles.
 *
 * for each distance 1, 2, 4, ... below NPROCS, send the current
 * partial maximum (as two unsigned words) to node (MY_PID ^ distance)
 * via _reduction_handler, spin until the value for that stage has
 * been delivered locally, and keep the larger of the two. two rows of
 * flags/values are used alternately, selected by a parity bit that
 * flips on every call.
 *
 * x:       this node's contribution
 * returns: the partial maximum after the last stage
 *
 * NOTE(review): presumably called collectively by every node, with
 * NPROCS a power of two no larger than 2^Log2MaxProcs -- confirm.
 */
double rgn_reduce_dmax(double x)
{
  /* the message carries the double as two unsigned words. go through
   * a union to split it; casting &x to (unsigned *) and reading
   * through that pointer violates the C aliasing rules
   */
  union { double d; unsigned w[2]; } bits;
  unsigned        w0;
  unsigned        w1;
  unsigned        parity;
  unsigned        self;
  unsigned        nprocs;
  unsigned        stage;
  unsigned        dist;
  char           *flags;
  double         *vals;
  double          tmp;
  ReductionState *state;

  state = &reduction_state;

  /* use the row named by the stored parity, and flip the stored
   * parity so that the next reduction uses the other row
   */
  if (state->parity)
  {
    parity        = 1;
    state->parity = 0;
  }
  else
  {
    parity        = 0;
    state->parity = 1;
  }

  self   = MY_PID;
  nprocs = NPROCS;
  stage  = 0;
  dist   = 1;
  flags  = state->flags[parity];
  vals   = state->vals[parity];

  while (dist < nprocs)
  {
    /* split the current partial result into its two words */
    bits.d = x;
    w0     = bits.w[0];
    w1     = bits.w[1];

    user_do_on((self^dist), _reduction_handler, parity, stage, w0, w1);
    stage += 1;
    dist  += dist;

    /* wait for this stage's value to arrive, then re-arm the slot */
    while (*((volatile char *) flags) == 0);
    *flags = 0;

    tmp = *vals;
    if (tmp > x) x = tmp;

    flags += 1;
    vals  += 1;
  }

  return x;
}


/* _reduction_handler: handler named in the user_do_on() calls made by
 * the rgn_reduce_d*() functions. stores the two data words from the
 * message into the value slot for the (parity, stage) pair read from
 * the message, then sets the matching flag.
 */
static void _reduction_handler(void)
{
  unsigned        parity;
  unsigned        stage;
  unsigned        w0, w1;
  char           *flags;
  unsigned       *vals;
  ReductionState *state;

  /* extract scalar args from head of message
   */
  /* (presumably registers 2..5 hold the parity, stage, w0 and w1
   * arguments that the sender passed to user_do_on -- confirm)
   */
  parity = (unsigned) ipi_in_reg(2);
  stage  = (unsigned) ipi_in_reg(3);
  w0     = (unsigned) ipi_in_reg(4);
  w1     = (unsigned) ipi_in_reg(5);
  /* NOTE(review): presumably disposes of the incoming message now that
   * its arguments have been read -- confirm against ipicst
   */
  ipicst(-1, -1);

  state = &reduction_state;
  flags = state->flags[parity];
  /* the double slot is addressed as two unsigned words so the value
   * can be stored exactly as it was split by the sender
   */
  vals  = (unsigned *) &(state->vals[parity][stage]);

  vals[0] = w0;
  vals[1] = w1;

  /* the reduction loop resets each slot's flag to 0 after consuming
   * it, so the flag is expected to be clear here; it is set only
   * after both words of the value have been written
   */
  sanity(flags[stage] == 0);
  flags[stage] = 1;
}
