/*              
 *  $Id: dmch.h,v 1.16 1994/05/19 20:21:22 gropp Exp gropp $
 *
 *  (C) 1993 by Argonne National Laboratory and Mississipi State University.
 *      All rights reserved.  See COPYRIGHT in top-level directory.
 */

/***********************************************************************
*                                                                      *
*   dmp4.h                                                             *
*   MPI for MS-Windows 3.1                                             *
*   current version: 0.99b          06/10/95                           *
*                                                                      *
*   Joerg Meyer                                                        *
*   University of Nebraska at Omaha (UNO)                              *
*   Department of Computer Science                                     *
*                                                                      *
*   This is an MPI implementation for MS-Windows 3.1                   *
*   It is based on the MPI implementation from Argonne National        *
*   Laboratory and Mississippi State University, version from          *
*   June 17, 1994. Note their COPYRIGHT.                               *
*   ( source code and user's guide available by anonymous FTP from     *
*     info.mcs.anl.gov in directory /pub/mpi )                         *
*   Anyone is free to copy and modify this code to suit his or her     *
*   own purposes as long as these notices are retained.                *
*                                                                      *
***********************************************************************/

/* 
   These are the Chameleon-specific macros.

   This file also contains some information on the design of the Chameleon/MPI
   transport layer.
 */

/*D
      Chameleon-MPI transport (device) implementation

      This simple Chameleon implementation is meant to work with
      most message passing systems and does not rely on large amounts
      of message buffering.  Messages are sent in one or two packets, depending
      on the length.  If the message is short (< MPID_PACKET_SIZE), then
      the header and message are placed into a single message and sent
      together.  Long messages are sent without copying the message by
      sending the header in a separate packet. 

      This Chameleon implementation depends on adequate buffering by the
      message-passing system.  Because there is no asynchronous notification
      of messages in Chameleon (because most transport layers do not provide
      this important functionality), it is difficult to implement a
      packet-based system (i.e., one that divides messages into multiple 
      packets and manages all buffering).

      Some random comments on the implementation:

      Many things could be done to improve the performance of the 
      implementation.  These include

$        For a nonblocking receive of a long message that is not yet
         available, send a message to the source (if known) with 
         a reserved message type and post a non-blocking receive for that
         type.  This can avoid an extra memory copy in some cases.
         The obvious race conditions must be handled.     
D*/

#ifndef _DMCH_INCLUDED
#define _DMCH_INCLUDED

/* 
   Redefine the names of the async communication types (typedefed in 
   comm$(COMM).h) so that a Chameleon program can include commmpi.h which
   includes this file (through the mpi.h file)  

   I'm not sure that this does what I need, but it doesn't hurt.
 */
/* Send-side and receive-side async id type names are both remapped to
   MPIDCH_-prefixed names for as long as these macros stay defined. */
#define ASYNCSendId_t   MPIDCH_ASYNCSendId_t
#define ASYNCRecvId_t   MPIDCH_ASYNCRecvId_t

/* 
   When we compile the device, we want to include all of the device code,
   but when we compile user code, we don't want to require that they load 
   the definitions in either tools.h or comm/comm.h.  
 */
#ifdef MPID_DEVICE_CODE

/* Device build: bring in the p4 interface header. */
#include "p4.h"

/* Globals referenced by the device code; they are only declared here and
   must be defined in some other translation unit. */
extern Int __NUMNODES;
extern Int __MYPROCID;
extern Int __P4LEN;
extern Int __P4TYPE;
extern Int __P4FROM;
extern Int __P4GLOBALTYPE;

/* Async operation ids are plain longs.  Because ASYNCRecvId_t and
   ASYNCSendId_t are macros at this point, these typedefs actually introduce
   the MPIDCH_-prefixed type names. */
typedef long ASYNCRecvId_t;
typedef long ASYNCSendId_t;

#else

/* User code: no declarations are made here.  (Placeholder typedefs that
   used to sit in this branch are disabled.) */

#endif /* MPID_DEVICE_CODE */

#include "mpi.h"
#include "dmpiatom.h"


/* 
   Size in bytes of the data area carried inside one packet (it is the
   length of the buffer member of MPID_PACKET).

   This packet size should be selected such that
   (s + r*(n+h)) + c*n \approx (s+r*n) + s+r*h
   where s = latency, r = time to send a byte, n = total message length, 
   h = header size, and c = time to copy a byte.  The left-hand side is the
   cost of copying the data next to the header and sending a single message;
   the right-hand side is the cost of sending the data and the header as two
   separate messages.  This condition reduces to
   c n \approx s
   For a typical system with
   s = 30us
   c = .03us/byte
   this gives
   n = s / c = 30 us / (.03us/byte) = 1000 bytes

   When the message does not fit into a single packet, ALL of the message
   should be placed in the "extension" packet (see below).  This removes 
   an extra copy from the code.
 */
#define MPID_PACKET_SIZE 1024


/*
  The implementation reserves some message tags.

  (An optimization is to allow the use of all but a few very large tags
  for messages in the initial communicator, thus eliminating a separate
  header.  Messages in a different communicator would be sent on a reserved
  set of tags.  An alternative is to use the Chameleon tags for communicator
  types, making the message-passing system handle the queueing of messages
  by communicator.)

  PT2PT_TAG is the tag for short messages and the headers of long messages
  PT2PT2_TAG(src) is the tag for longer messages (by source).  This permits
  the header messages to be freely received into preallocated buffers, and
  for long messages to be received directly into user-buffers.

  The mode field is overloaded for the synchronous case because we support
  NONBLOCKING, SYNCHRONOUS sends; thus, there can be a variety of outstanding
  synchronous sends at any time, and we have to match them up.

  We do this by making the mode field look like this:

  <syncreqnum><modetype>

  The mode field is required because, while there are different sends for each
  mode, there is only one kind of receive, and hence we need the mode field
  to decide what to do.  In fact, our only need is to handle MPI_SYNCHRONOUS
  sends.
 */
/* Tag used for short messages and for the headers of long messages. */
#define MPID_PT2PT_TAG 0

/* Tag used for the body of a long message; one distinct tag per source,
   starting right after MPID_PT2PT_TAG. */
#define MPID_PT2PT2_TAG(source_rank) ((source_rank) + 1)

/* Tells a device routine whether the operation should block or not. */
typedef enum {
    MPID_NOTBLOCKING = 0,
    MPID_BLOCKING    = 1
} MPID_BLOCKING_TYPE;

/*
   The mode field contains an ID if the mode is SYNCHRONOUS.

   MPID_MODE_BITS is the width of the mode-type part of that field and
   MPID_MODE_MASK selects exactly those low-order bits.
 */
#define MPID_MODE_BITS   3
#define MPID_MODE_MASK   ((1 << MPID_MODE_BITS) - 1)
/* Packet layout: five Int envelope fields followed by an inline data area
   of MPID_PACKET_SIZE bytes. */
typedef struct {
    Int  len;           /* TOTAL length of message in BYTES */
    Int  tag;           /* Message tag */
    Int  context_id;    /* Internal communicator ID */
    Int  mode;          /* mode (standard, ready, synchronous, sync_ack) */
    Int  lrank;         /* rank in sending context */
    char buffer[MPID_PACKET_SIZE];  /* Minimum message size */
} MPID_PACKET;

/* Length in bytes of the envelope: everything in MPID_PACKET other than the
   buffer (this also counts any padding the compiler adds to the struct). */
#define MPID_HEADER_LEN (sizeof(MPID_PACKET) - MPID_PACKET_SIZE)

/* Number of ints in the header; keep in step with the envelope fields of
   MPID_PACKET. */
#define MPID_HEADER_INTS 5
/*
   Another option would be for the device handle to contain the initial
   packet.  I have NOT done this so as to keep down the size of the device
   handle (since I want relatively large packets).

   This needs to be structured so that there is just enough here for mpir.h.
 */
/* Device part of a send handle. */
typedef struct {
    /* Set when the message has been sent and, if a SYNC mode, the ack has
       been received */
    MPIR_BOOL      done;
    Int            is_non_blocking;
    /* Id of non-blocking send, if used.  0 if no non-blocking send used */
    Int            sid;

    /* --- Description of the buffer to be sent --- */
    void far      *start;
    Int            bytes_as_contig;
    /* Will eventually hold datatype for heterogeneous systems */
    MPIR_NODETYPE  dataelement;
} MPID_SHANDLE;

/* Far pointer to a send handle, and far pointer to such a pointer. */
typedef MPID_SHANDLE far *LPMPID_SHANDLE;
typedef LPMPID_SHANDLE far *LPPMPID_SHANDLE;

/* Device part of a receive handle. */
typedef struct {
    MPIR_BOOL      done;
    Int            is_non_blocking;
    /* Holds body of unexpected message */
    char far      *temp;
    /* mode bits and sequence number; needed for unexpected messages */
    Int            mode;
    /* Absolute process number that sent message; used only for SYNC ack */
    Int            from;

    /* --- Description of the buffer to be received --- */
    void far      *start;
    Int            bytes_as_contig;
    /* Will eventually hold datatype for heterogeneous systems */
    MPIR_NODETYPE  dataelement;
} MPID_RHANDLE;

/* Smaller of the two arguments.  The selected argument is evaluated twice,
   so do not pass expressions with side effects. */
#define MPID_MIN(x, y) (((x) < (y)) ? (x) : (y))

/* Feature flag, defined with no value.
   NOTE(review): presumably advertises that this device can send contiguous
   buffers directly -- confirm against the code that tests it. */
#define MPID_CAN_SEND_CONTIG

/* 
   Since allocation is done by placing the device structure directly into
   the MPIR_?HANDLE, we don't need to allocate space.  We do, however, take
   this opportunity to initialize it...

   The ...reuse... versions are for persistent handles (e.g., MPI_Send_init)
 */
/* Initialise a send handle: mark it as not yet complete. */
#define MPID_alloc_send_handle( a ) {(a)->done = MPIR_NO;}
/* Initialise a receive handle: not yet complete, no temporary buffer. */
#define MPID_alloc_recv_handle( a ) {(a)->done = MPIR_NO;(a)->temp  = 0;}
/* A send handle owns nothing that needs releasing. */
#define MPID_free_send_handle( a )
/* Release the temporary buffer of a receive handle, if there is one.
   The whole expansion is wrapped in braces (like the other statement
   macros here) so that an `else` following the macro at the call site
   cannot attach itself to the `if` inside the macro. */
#define MPID_free_recv_handle( a )  {if ((a)->temp) {MPI_FREE((a)->temp);}}
/* Re-arm a persistent send handle for another use. */
#define MPID_reuse_send_handle( a ) {(a)->done = MPIR_NO;}
/* Re-arm a persistent receive handle for another use.  Note that temp is
   simply cleared here, not freed. */
#define MPID_reuse_recv_handle( a ) {(a)->done = MPIR_NO;(a)->temp  = 0;}
/* Record whether the operation on handle a is non-blocking (v).  Fully
   parenthesized so the expansion is a single expression in any context. */
#define MPID_set_send_is_nonblocking( a, v ) ((a)->is_non_blocking = (v))
#define MPID_set_recv_is_nonblocking( a, v ) ((a)->is_non_blocking = (v))

/* Contact with the device layer is made here.  These call the
   routines to actually process a message (see dmch.c) 

   We use different names to enable the use of a multi-protocol system
   (planned for future support)
 */
/* Point-to-point entry points: each generic MPID_ name forwards its
   arguments unchanged to the MPID_P4_ routine of the same name. */
#define MPID_post_send(sh)            MPID_P4_post_send(sh)
#define MPID_post_recv(rh, avail)     MPID_P4_post_recv(rh, avail)
#define MPID_blocking_recv(rh)        MPID_P4_blocking_recv(rh)
#define MPID_complete_recv(rh)        MPID_P4_complete_recv(rh)
#define MPID_complete_send(sh, st)    MPID_P4_complete_send(sh, st)

/* Device polling and probing. */
#define MPID_check_device(blk)        MPID_P4_check_device(blk)
#define MPID_iprobe(tg, src, ctx, flg, st) \
    MPID_P4_iprobe(tg, src, ctx, flg, st)
#define MPID_probe(tg, src, ctx, st) \
    MPID_P4_probe(tg, src, ctx, st)

/* Environment queries and start-up / shut-down; each forwards to the
   MPID_P4_ routine of the corresponding name. */
#define MPID_NODE_NAME( name, len ) \
    MPID_P4_Node_name( name, len )
#define MPID_WTIME()         MPID_P4_Wtime()
#define MPID_WTICK()         MPID_P4_Wtick()
#define MPID_INIT(argc,argv) MPID_P4_init( argc, argv )
#define MPID_END()           MPID_P4_End()

/* Abort through the device.  The expansion deliberately carries no trailing
   semicolon (the caller writes it, as for every other macro here), so that
   `if (c) MPID_ABORT(e); else ...` is well-formed. */
#define MPID_ABORT( errorcode ) MPID_P4_Abort( errorcode )

/* Cancel the request r. */
#define MPID_CANCEL( r ) MPID_P4_Cancel( r )

/* Fetch this process's rank / the number of processes through the device. */
#define MPID_myrank( rank ) MPID_P4_myrank( rank )
#define MPID_mysize( size ) MPID_P4_mysize( size )

/* thread locking.  Single-thread devices will make these empty 
   declarations.  Both macros are defined empty here, so the comm argument
   is discarded and no locking is performed. */
#define MPID_THREAD_LOCK(comm)
#define MPID_THREAD_UNLOCK(comm)

/* This device prefers that the data be prepacked (at least for now).
   Both symbols are defined with no value.
   NOTE(review): presumably they are tested with #ifdef by the
   device-independent layer -- confirm. */
#define MPID_PACK_IN_ADVANCE
#define MPID_RETURN_PACKED

/* Forward refs: prototypes for the P4 device routines declared in this
   header.  Prototyped declarations are used unconditionally; the __STDC__
   test that once selected between these and old-style declarations is
   disabled. */
extern void MPID_P4_Abort(Int);
extern void MPID_P4_myrank(Int far *);
extern void MPID_P4_mysize(Int far *);
extern void MPID_P4_End(void);
extern Int  MPID_P4_Cancel(MPI_Request far *request);

/*
   Disabled alternatives, kept for reference:

     - An earlier MPID_P4_Cancel taking  MPIR_COMMON far *r , and an
       old-style  MPID_P4_Cancel()  used when __BORLANDC__ was not defined.

     - Prototypes for the point-to-point routines:
           extern Int MPID_P4_post_send( MPIR_SHANDLE * ),
                      MPID_P4_blocking_recv( MPIR_SHANDLE * ),
                      MPID_P4_complete_recv( MPIR_SHANDLE * ),
                      MPID_P4_complete_send( MPIR_SHANDLE *, MPI_Status * );
       and, without any types written down,
           MPID_P4_check_device( blocking ),
           MPID_P4_iprobe( tag, source, context_id, flag, status ),
           MPID_P4_probe( tag, source, context_id, status )

     - Old-style (non-__STDC__) declarations:
           extern void MPID_P4_Abort();
           extern void MPID_P4_myrank(), MPID_P4_mysize(), MPID_P4_End();

   NOTE(review): the routines named in the second item, as well as
   MPID_P4_Wtime, MPID_P4_Wtick, MPID_P4_init and MPID_P4_Node_name, have no
   prototype in this header.  Unless another header declares them, callers
   rely on implicit declarations -- confirm.
 */

#endif

#ifdef MPID_DEVICE_CODE

/* Device-only information */
#include "../../include/dmpiatom.h"

/* Heterogeneous support --- NOT YET FULLY IMPLEMENTED
   ("fully" meaning that some of the code is not used yet).

   The declarations below describe how data should be communicated to a
   processor.  Data is converted only when the source and destination
   formats differ, and then byte-swapping on the SENDER is preferred over
   xdr wherever possible.  In a heterogeneous environment the receiver
   therefore only has to check for a sender that had to use MPID_H_XDR;
   in every other case the received data is already in the correct format.

   None of this code is executed in a homogeneous environment.
 */
#ifndef MPID_H_INC
#define MPID_H_INC

/* Data-format tag for a processor.
   NOTE(review): LSB / MSB presumably stand for least- / most-significant
   byte first -- confirm. */
typedef enum {
    MPID_H_NONE = 0,
    MPID_H_LSB  = 1,
    MPID_H_MSB  = 2,
    MPID_H_XDR  = 3
} MPID_H_TYPE;

/* The MPID_INFO structure is acquired from each node and used to determine
   the format for data that is sent. */
typedef struct {
    MPID_H_TYPE byte_order;
} MPID_INFO;

extern MPID_INFO   *MPID_procinfo;
extern MPID_H_TYPE  MPID_byte_order;

/* Installed error handler (a function pointer) and the default handler. */
extern void (*MPID_ErrorHandler)(Int, char far *);
extern void MPID_DefaultErrorHandler(Int, char far *);

/* MPID_HAS_HETERO is currently disabled for every transport. */
#if defined(p4) || defined(pvm) || defined(pvm3)
/* #define MPID_HAS_HETERO */
#endif

#endif /* MPID_H_INC */

#else  /* not MPID_DEVICE_CODE */

/* Allow a Chameleon program to have these symbols */
#undef ASYNCRecvId_t
#undef ASYNCSendId_t

#endif /* MPID_DEVICE_CODE */

/* #ifdef WIN31
Int		MPID_GetMsgDebugFlag(void);
void	MPID_PrintMsgDebug(void);
Int		MPID_P4_check_incoming (MPID_BLOCKING_TYPE is_blocking);
Int		MPID_P4_blocking_recv(MPIR_RHANDLE far *);
Int		MPID_P4_post_recv(MPIR_RHANDLE far *, Int far *)
Int		MPID_P4_Process_unexpected(MPIR_RHANDLE far *, MPIR_RHANDLE far *)
Int		MPID_P4_post_send (MPIR_SHANDLE far *dmpi_send_handle);
void	MPID_P4_complete_send (MPIR_SHANDLE far *, MPI_Status far *)




void	MPID_SetSyncDebugFlag( Int f);
P4VOID	p4_set_dbg_level(Int level);
#endif */
