/* Turn on DISTRIBUTED_MEMORY unless the includer has already defined it. */
#if !defined(DISTRIBUTED_MEMORY)
#define DISTRIBUTED_MEMORY
#endif


/* vcid = $Id: bcp.h,v 1.1.1.1 1998/08/27 19:16:25 gropp Exp $ */

#ifndef __BCP
#define __BCP

/* Choose general types based on specific architectures */

#include "mpi.h"
/* Variables defined elsewhere (not in this header). */
extern MPI_Status __STATUS;   /* status written by RECVWAIT and RECVSYNC */
extern int __NUMNODES;        /* value produced by PSNUMNODES() */
extern int __MYPROCID;        /* value produced by PSMYPROCID() */

/* File-scope scratch variables.  Because they are static and live in a
   header, every source file that includes it gets its own copy. */
static MPI_Status _mpi_status;
static int _n;
static int __MPILEN;          /* count stored by RECVWAIT */
/* PARALLEL_VERSION is defined whenever DISTRIBUTED_MEMORY is. */
#if defined(DISTRIBUTED_MEMORY)
#define PARALLEL_VERSION
#endif

/* DBUG(a): when DEBUG is defined, write the string a to stdout and flush it
   right away; otherwise the macro expands to nothing. */
#if defined(DEBUG)
#define DBUG(a) { \
    printf("%s",a); \
    fflush(stdout); \
}
#else
#define DBUG(a)
#endif

/* Floating-point precision selection.  DOUBLE is defined unconditionally
   right here, so the double-precision pair is always the one chosen.
   MY_FLOAT is the C element type, MY_MPI_FLOAT the MPI datatype passed with
   buffers of that type.
   NOTE(review): MPI_DOUBLE_PRECISION and MPI_REAL are the Fortran datatype
   names; confirm they are intended rather than MPI_DOUBLE / MPI_FLOAT. */
#define DOUBLE
#if defined(DOUBLE)
#define MY_FLOAT     double
#define MY_MPI_FLOAT MPI_DOUBLE_PRECISION
#else
#define MY_FLOAT     float
#define MY_MPI_FLOAT MPI_REAL
#endif

/*
   Definitions for mesh communication routines
 */

/* slab: strides, lengths and increments describing a region of MY_FLOAT
   data, plus the data pointer itself. */
typedef struct {
    /* strides and lengths */
    int s1;
    int n1;
    int n2;
    int n3;
    int n4;
    int n5;

    /* increments between elements in the two leading dimensions */
    int inc1;
    int inc2;
    int inc3;
    int inc4;

    MY_FLOAT *p;     /* data */
    void     *lctx;  /* local context (used to compute p) */
} slab;

/* BC_entry / BCentry: one entry of a block-communication program. */
typedef struct BC_entry {
    int  type;
    /* Source or destination data */
    slab src;
    /* Used to group into send/recv phases */
    int  phase;
    /* id of destination */
    int  id;
    /* partner processor OR offset of partner operation if local */
    int  processor;
#if defined(DISTRIBUTED_MEMORY)
    /* All of this data is set by the comm_compile routine */

    /* True if buffer is actually the source or destination
       (no copy needed) */
    int  inplace;
    /* Communications status */
    MPI_Request rc;
    /* Message type */
    int  mtype;
    /* buffer length */
    int  act_len;
    /* buffer for moving data */
    MY_FLOAT *buffer;
#endif
} BCentry;

/*
   Strided copy helpers.  Each expands to a brace-enclosed block that copies
   n elements from y to x, for i = 0 .. n-1:

   COPYINC1(x,y,yinc,n)        x[i]      = y[i*yinc]
   COPYINC2(x,y,xinc,n)        x[i*xinc] = y[i]
   COPYINC3(x,y,xinc,yinc,n)   x[i*xinc] = y[i*yinc]

   The count and increment arguments are parenthesized in the expansion so
   that compound expressions (for example "flag ? 3 : 1") behave as written.
   All arguments are re-evaluated on every iteration, so they should be free
   of side effects.  The loop temporaries are named __i, __j and __k; the
   arguments must not refer to variables with those names.
 */
#define COPYINC1(x,y,yinc,n) { \
    int __i, __j = 0; \
    for (__i = 0; __i < (n); __i++) { \
        (x)[__i] = (y)[__j]; \
        __j += (yinc); \
    } \
}

#define COPYINC2(x,y,xinc,n) { \
    int __i, __j = 0; \
    for (__i = 0; __i < (n); __i++) { \
        (x)[__j] = (y)[__i]; \
        __j += (xinc); \
    } \
}

#define COPYINC3(x,y,xinc,yinc,n) { \
    int __i, __j = 0, __k = 0; \
    for (__i = 0; __i < (n); __i++) { \
        (x)[__j] = (y)[__k]; \
        __j += (xinc); \
        __k += (yinc); \
    } \
}

/*
   GCOL: gather my_vec from every process into all_vec on every process with
   MPI_Allgather over MPI_COMM_WORLD.  The macro is an expression whose value
   is the return code of MPI_Allgather.

   my_vec       local contribution
   my_vec_len   length of the local contribution in bytes; it is divided by
                the size of data_type to obtain the element count used for
                both the send and the receive side
   all_vec      receive buffer for the contributions of all processes
   data_type    MPI datatype of the elements
   all_vec_len, nc, procset   ignored

   The element size is obtained with MPI_Type_size.  Taking sizeof(data_type)
   would measure the MPI datatype handle, not an element, and yield a wrong
   count.  The size is stored in the file-scope scratch variable _n declared
   near the top of this header, which is therefore overwritten by every use.
 */
#define GCOL(my_vec,my_vec_len,all_vec,all_vec_len,nc,procset,data_type) \
    (MPI_Type_size(data_type,&_n), \
     MPI_Allgather(my_vec,(my_vec_len)/_n,data_type,\
                   all_vec,(my_vec_len)/_n,data_type,MPI_COMM_WORLD))

/* Process-set helpers: every process set maps to MPI_COMM_WORLD, and the
   node count / process id come from __NUMNODES / __MYPROCID.  The macro
   arguments are ignored. */
#define PSPartition(a,b)    MPI_COMM_WORLD
#define PIAllProcs          MPI_COMM_WORLD
#define ALLPROCS            MPI_COMM_WORLD
#define __ALLPROCS          MPI_COMM_WORLD
#define PSNUMNODES(a)       __NUMNODES
#define PSMYPROCID(a)       __MYPROCID

/* Nonzero only when i is 0; procset is ignored. */
#define GTOKEN(procset,i)   ((i)==0)

/* Barrier on communicator a. */
#define PIgsync(a)          MPI_Barrier(a)

/* Request handles: set a to, or compare a with, MPI_REQUEST_NULL. */
#define PIRecvIdClear(a)    a = MPI_REQUEST_NULL
#define PIRecvIdNull(a)     ((a) == MPI_REQUEST_NULL)

/* Tags: PIGetTags always yields 199; PIReturnTags expands to nothing. */
#define PIGetTags(a,b)      199
#define PIReturnTags(a,b,c)

/* Event logging expands to nothing. */
#define LOGEVENT(a)

/* Fill n bytes at s with c using memset(). */
#define MEMSET(s,c,n)       memset((char*)(s),c,n)

/*
   PIgdmin: element-wise global minimum of a MY_FLOAT vector, computed with
   MPI_Allreduce(MPI_MIN) over MPI_COMM_WORLD.

   vec        in/out: vec_len local values, overwritten with the result
   vec_len    number of MY_FLOAT elements
   work_vec   scratch space for at least vec_len MY_FLOAT elements; it
              receives a copy of the local values and is the send buffer
   procset    ignored

   Expands to a brace-enclosed block; the MPI return code is discarded.
   The temporaries are typed MY_FLOAT * to match the data and are initialised
   from the vec parameter.  They are named i99, iptr1 and iptr2.
 */
#define PIgdmin(vec,vec_len,work_vec,procset) \
{ \
    int i99; \
    MY_FLOAT *iptr1 = (MY_FLOAT *) (vec), *iptr2 = (MY_FLOAT *) (work_vec); \
    for (i99=0;i99<(vec_len);i99++) { \
        (iptr2)[i99] = (iptr1)[i99]; \
    } \
    MPI_Allreduce(iptr2,iptr1,(vec_len),MY_MPI_FLOAT,MPI_MIN,MPI_COMM_WORLD); \
}

/*
   PIgdmax: element-wise global maximum of a MY_FLOAT vector, computed with
   MPI_Allreduce(MPI_MAX) over MPI_COMM_WORLD.

   vec        in/out: vec_len local values, overwritten with the result
   vec_len    number of MY_FLOAT elements
   work_vec   scratch space for at least vec_len MY_FLOAT elements; it
              receives a copy of the local values and is the send buffer
   procset    ignored

   Expands to a brace-enclosed block; the MPI return code is discarded.
   The temporaries are typed MY_FLOAT * to match the data and are initialised
   from the vec parameter.  They are named i99, iptr1 and iptr2.
 */
#define PIgdmax(vec,vec_len,work_vec,procset) \
{ \
    int i99; \
    MY_FLOAT *iptr1 = (MY_FLOAT *) (vec), *iptr2 = (MY_FLOAT *) (work_vec); \
    for (i99=0;i99<(vec_len);i99++) { \
        (iptr2)[i99] = (iptr1)[i99]; \
    } \
    MPI_Allreduce(iptr2,iptr1,(vec_len),MY_MPI_FLOAT,MPI_MAX,MPI_COMM_WORLD); \
}

/*
   PIgdsum: element-wise global sum of a MY_FLOAT vector, computed with
   MPI_Allreduce(MPI_SUM) over MPI_COMM_WORLD.

   vec        in/out: vec_len local values, overwritten with the result
   vec_len    number of MY_FLOAT elements
   work_vec   scratch space for at least vec_len MY_FLOAT elements; it
              receives a copy of the local values and is the send buffer
   procset    ignored

   Expands to a brace-enclosed block; the MPI return code is discarded.
   The temporaries are typed MY_FLOAT * to match the data and are initialised
   from the vec parameter.  They are named i99, iptr1 and iptr2.
 */
#define PIgdsum(vec,vec_len,work_vec,procset) \
{ \
    int i99; \
    MY_FLOAT *iptr1 = (MY_FLOAT *) (vec), *iptr2 = (MY_FLOAT *) (work_vec); \
    for (i99=0;i99<(vec_len);i99++) { \
        (iptr2)[i99] = (iptr1)[i99]; \
    } \
    MPI_Allreduce(iptr2,iptr1,(vec_len),MY_MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD); \
}

/*
   PIgimax: element-wise global maximum of an int vector, computed with
   MPI_Allreduce(MPI_MAX) over MPI_COMM_WORLD.

   max_vec    in/out: vec_len local values, overwritten with the result
   vec_len    number of int elements
   work_vec   scratch space for at least vec_len ints; it receives a copy of
              the local values and is the send buffer
   procset    ignored

   Expands to a brace-enclosed block; the MPI return code is discarded.
   The pointer arguments are parenthesized before the cast so that compound
   expressions (e.g. "bytes + offset") are converted as a whole.  The
   temporaries are named i99, iptr1 and iptr2.
 */
#define PIgimax(max_vec,vec_len,work_vec,procset) \
{ \
    int i99, *iptr1 = (int *) (max_vec), *iptr2 = (int *) (work_vec); \
    for (i99=0;i99<(vec_len);i99++) { \
        (iptr2)[i99] = (iptr1)[i99]; \
    } \
    MPI_Allreduce(iptr2,iptr1,(vec_len),MPI_INT,MPI_MAX,MPI_COMM_WORLD); \
}

/*
   GISUM: element-wise global sum of an int vector, computed with
   MPI_Allreduce(MPI_SUM) over MPI_COMM_WORLD.

   sum_vec    in/out: vec_len local values, overwritten with the result
   vec_len    number of int elements
   work_vec   scratch space for at least vec_len ints; it receives a copy of
              the local values and is the send buffer
   procset    ignored

   Expands to a brace-enclosed block; the MPI return code is discarded.
   The pointer arguments are parenthesized before the cast so that compound
   expressions (e.g. "bytes + offset") are converted as a whole.  The
   temporaries are named i99, iptr1 and iptr2.
 */
#define GISUM(sum_vec,vec_len,work_vec,procset) \
{ \
    int i99, *iptr1 = (int *) (sum_vec), *iptr2 = (int *) (work_vec); \
    for (i99=0;i99<(vec_len);i99++) { \
        (iptr2)[i99] = (iptr1)[i99]; \
    } \
    MPI_Allreduce(iptr2,iptr1,(vec_len),MPI_INT,MPI_SUM,MPI_COMM_WORLD); \
}

/* MSGFREERECV(msg): release msg with free().  free() accepts a null pointer,
   so no test is needed and msg is evaluated exactly once.  The expansion is
   a brace-enclosed block rather than a bare "if", so it cannot capture an
   "else" that follows the macro call. */
#define MSGFREERECV(msg)               {free(msg);}

/*
   RECVWAIT: wait for the request id to complete (MPI_Wait), leaving the
   status in __STATUS, then store the message size, counted in MPI_BYTE
   units, in __MPILEN.  The type, buffer, length and datatype arguments are
   not used.  Expands to a brace-enclosed block; return codes are discarded.
 */
#define RECVWAIT(type,buffer,length,datatype,id) \
{ \
    MPI_Wait(&(id), &__STATUS); \
    MPI_Get_count(&__STATUS, MPI_BYTE, &__MPILEN); \
}

/* RECVWAITNOMEM: same operation as RECVWAIT. */
#define RECVWAITNOMEM(type,buffer,length,datatype,id) \
    RECVWAIT(type,buffer,length,datatype,id)

/*
   MSGALLOCRECV: allocate room for buflen objects of the C type data_type
   with malloc() and assign the result to buf.  buflen is parenthesized so
   that an expression such as "n + 1" is multiplied by the element size as a
   whole.  The result is not checked: buf is NULL if malloc() fails.

   MSGALLOCFREE: release a buffer with free().
 */
#define MSGALLOCRECV( buf, buflen, data_type) \
    (buf) = (data_type *) malloc((buflen)*sizeof(data_type))

#define MSGALLOCFREE( outbuf ) \
    free(outbuf)

/*
   Point-to-point wrappers.  All of them use MPI_COMM_WORLD and pass "type"
   as the message tag.

   SENDSYNCNOMEM   MPI_Send of size elements of data_type to to_proc.  buf
                   is parenthesized before the cast so that a pointer
                   expression such as "base + offset" is sent as written.
   RECVSYNC        MPI_Recv from MPI_ANY_SOURCE; status goes to __STATUS.
   RECVSYNCNOMEM   same as RECVSYNC.  NOTE(review): the expansion ends in
                   its own ';', unlike the other wrappers.  It is kept in
                   case callers rely on it, but it means the macro cannot be
                   used as an expression or directly before an "else".
   RECVASYNCNOMEM  MPI_Irecv from MPI_ANY_SOURCE; the request is stored in
                   id.  datatype is ignored: length counts MPI_BYTE items.
   SENDASYNCNOMEM  MPI_Isend to "to"; the request is stored in id.
                   datatype is ignored: length counts MPI_BYTE items.
 */
#define SENDSYNCNOMEM(type,buf,size,to_proc,data_type) \
    MPI_Send((void *)(buf),size,data_type,to_proc,type,MPI_COMM_WORLD)

#define RECVSYNC(type,buf,length,data_type) \
    MPI_Recv(buf,length,data_type,MPI_ANY_SOURCE,type,MPI_COMM_WORLD,&__STATUS)

#define RECVSYNCNOMEM(type,buffer,length,datatype)  \
        RECVSYNC(type,buffer,length,datatype);

#define RECVASYNCNOMEM(type,buffer,length,datatype,id) \
   MPI_Irecv( buffer,length,MPI_BYTE,MPI_ANY_SOURCE,type,MPI_COMM_WORLD,&(id))

#define SENDASYNCNOMEM(type,buffer,length,to,datatype,id) \
   MPI_Isend( buffer,length,MPI_BYTE,to,type,MPI_COMM_WORLD,&(id))

/* NEXTLINE(a): step to the next program line.  a holds the current line on
   entry and the next line on exit; the value of the expression is that
   next line. */
#define NEXTLINE(a) (++(a))

/*
   The types are defined here.  They fall into three basic
   groups: local, send to another processor and receive from another
           processor
   The send and receive have a number of minor modes that are intended
   to provide for optimizations:
   
   SEND               - Contains info for setting a send buffer
   RECV               - Contains info for processing a receive buffer
   BUFFER             - This operation contains the actual buffer to be
                        sent or received
   SYNC_BLOCK         - (SEND ONLY).  Complete receives of all items
                        with phase less than this instruction.
   SYNC_NBR           - synchronize with neighbor (force a handshake;
                        this can be used to speed up long messages on
                        some machines).
   FORCETYPE          - use ForceType send (receive buffer is guaranteed
                        to exist on destination node)
   To make things easier on the user, the comm_compile() routine handles
   modifying the modes of SEND and RECV.

   In order to simplify the testing, the major modes occupy a full byte
   and could be accessed with a load-byte instruction (instead of using
   a mask).  This is not yet implemented.
 */

/* Major operations supported internally only */
#define BLOCK_COMM_LOCAL_SRC      0x004
#define BLOCK_COMM_LOCAL_DEST     0x008

/* Mask selecting the major mode (the low byte of the type field) */
#define MAJOR_MODE_MASK       0x0FF

/* Minor modes only */
#define BLOCK_COMM_BUFFER     0x100
#define BLOCK_COMM_SYNC_BLOCK 0x200
#define BLOCK_COMM_SYNC_NBR   0x400
#define BLOCK_COMM_FORCETYPE  0x800

/* Read or replace the major mode of the entry pointed to by a.  The setter
   keeps the minor-mode bits and ORs in b. */
#define GET_MAJOR_MODE(a)   ((a)->type & MAJOR_MODE_MASK)
#define SET_MAJOR_MODE(a,b) (a)->type = ((a)->type & ~MAJOR_MODE_MASK) | (b)

/* Macros to test modes.  BLOCK_COMM_SRC and BLOCK_COMM_DEST are not defined
   in this header; they must be visible wherever IS_SRC / IS_DEST are used. */
#define IS_SRC(a)        (GET_MAJOR_MODE(a) == BLOCK_COMM_SRC)
#define IS_DEST(a)       (GET_MAJOR_MODE(a) == BLOCK_COMM_DEST)
#define IS_LOCAL_SRC(a)  (GET_MAJOR_MODE(a) == BLOCK_COMM_LOCAL_SRC)
#define IS_LOCAL_DEST(a) (GET_MAJOR_MODE(a) == BLOCK_COMM_LOCAL_DEST)

/* Nonzero when either local bit is set; the value is the masked bits, not
   necessarily 1. */
#define IS_LOCAL(a) ( (a)->type & (BLOCK_COMM_LOCAL_SRC | BLOCK_COMM_LOCAL_DEST) )

/* Type used in validating program on distributed memory machines */
#if defined(DISTRIBUTED_MEMORY)
#define VALID_TYPE       12345
#define BLK_FINDOWN_TYPE 12777

/* Number of bytes at which the nbr-pair method should be used.  This
   depends on the particular machine, so a value defined before this point
   takes precedence over the default.  This should be handled in pairsync
   code. */
#if !defined(MIN_PAIR_SIZE)
#define MIN_PAIR_SIZE 1000
#endif
#endif /* DISTRIBUTED_MEMORY */

/* Defined only when building with intelnx. */
#if defined(intelnx)
#define BASE_FORCE 0x40000000
#endif

/*
   Here are new data types and objects.  The intent here is to further
   simplify the process of processing the communication requests, without
   making the "program" structure do too many things.  The actual send/recv
   operations need only a type, buffer, and length; and an id for send.
   Both may need a comm_id if they are asynchronous.

   Code to move between buffers needs the local and global buffer and length.
   
   Each phase may contain 
      async recv
      prepare to send/send
      wait for recvs to complete and move

   NOT USED YET
 */
/* Descriptor for one send. */
typedef struct {
    int  type;
    int  to;
    int  len;
    int  rc;
    char *p;
    void *lctx;
} BCSends;

/* Descriptor for one asynchronous receive. */
typedef struct {
    int  type;
    int  maxlen;
    int  rc;
    char *p;
    void *lctx;
} BCARecvs;

/* One phase: nsend / nrecv counts with pointers to send and receive
   descriptors. */
typedef struct {
    int      nsend;
    int      nrecv;
    BCSends  *send;
    BCARecvs *recv;
} BCPhaseBlk;

/* A whole object: nphase count with a pointer to phase blocks. */
typedef struct {
    int        nphase;
    BCPhaseBlk *phase;
} BCObject;
/* Record used when sorting the program: a pointer to an entry together with
   two integers, order and idx. */
typedef struct {
    BCentry *entry;
    int     order;
    int     idx;
} BC_Sblock;

#endif
