/* sbp_multiproc.h */

/* (c) 1994 by bird@cs.uni-sb.de */

/* $Id: sbp_multiproc.h,v 1.15 1995/01/30 13:31:43 bird Exp bird $ */

/*
 * for optimal execution files including "sbp_multiproc.h" should be compiled
 * with compiler-option -O2 (to force function-inlining)
 *
 */

/* 
 * Implementation of several locking mechanisms
 *
 * Implementation of barrier synchronization mechanisms
 *
 * Implementation of a parallel loop mechanism
 *
 * Calculation of a unique id in a process group
 *
 * Implementation of a parallel copy routine
 *
 * Implementation of a parallel fifo queue
 *
 * Implementation of a parallel broadcast queue
 *
 * Implementation of a parallel stack
 *
 */


/* ############### locks ############### */

/*
 *
 * 6 different locks are provided:
 *
 *     simple locks, save locks, fair locks, priority reader/writer-locks,
 *     fair group locks and fair reader writer locks
 *
 * simple locks work faster than save locks and fair locks
 * save locks provide some features useful for debugging
 * fair locks guarantee access to a lock in demand order (when using simple
 *   or save locks, processors with low IDs will always get access before
 *   processors with high IDs because of the multiprefix order)
 *
 * reader/writer-locks provide the following mechanism:
 * - several readers can own the lock simultaneously
 * - a reader and a writer can't own the lock simultaneously
 * - two different writers can't own the lock simultaneously
 *
 * a group lock is used to allow exclusive access to a critical section
 *   to one of several user-defined process groups
 *
 * the locking mechanism is based on the following functions:
 * (MODULO is the modulo bit which toggles on each round)
 *

 initialization of a lock  - must be called before any attempt to catch
 a lock and should only be executed by one process (master)

    void sbp_lock_init(sbp_lock_t *lock)
    void sbp_save_lock_init(sbp_save_lock_t *lock)
    void sbp_fair_lock_init(sbp_fair_lock_t *lock)
    void sbp_rw_lock_init(sbp_rw_lock_t *lock)
	int  sbp_gr_fair_lock_init(sbp_gr_fair_lock_t *gl, unsigned int num_groups)
         a maximum of "num_groups" processes may use the lock "gl"
		   ("num_groups" in {0..0xffffffff})
		 returns 1 on success and 0 on failure
	int  sbp_rw_fair_lock_init(sbp_rw_fair_lock_t *fl)
		 returns 1 on success and 0 on failure

 catch a lock - lock must be initialized
 (will loop until the lock could be locked - there's no timeout !) 

    void sbp_lock(sbp_lock_t *lock)
    void sbp_save_lock(sbp_save_lock_t *lock)
    void sbp_fair_lock(sbp_fair_lock_t *lock)
    void sbp_rw_lock(sbp_rw_lock_t *lock, PTYPE)
         "PTYPE" in {SBP_RW_READER, SBP_RW_WRITER}
	void sbp_gr_fair_lock(sbp_gr_fair_lock_t *gl, unsigned int group_id)
	     "group_id" in {0..num_groups-1}
	void sbp_rw_fair_lock(sbp_rw_fair_lock_t *fl, int ptype)
	
 unlock a lock - executing process must be owner of the lock

  simple version/fair version
  ---------------------------  
  (theoretically it's possible that a process that doesn't hold a lock l
  can unlock l - this will lead to unpredictable results and must be
  avoided by the programmer)

  this function will always return 0 (for compatibility with the save version)

    int sbp_unlock(sbp_lock_t *lock)
    int sbp_fair_unlock(sbp_fair_lock_t *lock)

  save version
  ------------ 
  (return code:
      0 - unlock was successful
      1 - tried to unlock a lock that wasn't locked
      2 - tried to unlock a lock whose owner was another process
   in case 1 and 2, the state of the lock isn't changed)

    int sbp_save_unlock(sbp_save_lock_t *lock)

  priority-reader/writer-lock
  ------------------

     void sbp_rw_unlock(sbp_rw_lock_t *lock, PTYPE, int wait)
          "PTYPE" in {SBP_RW_READER, SBP_RW_WRITER}
          "wait" is the delay a writer waits before freeing the rw-lock
                 for other writers after having freed it for readers
                 (delay corresponds to wait*3 SBPRAM-rounds)
  fair group lock
  ---------------

     void sbp_gr_fair_unlock(sbp_gr_fair_lock_t *gl);

  fair-reader/writer-lock
  -----------------------

     void sbp_rw_fair_unlock(sbp_rw_fair_lock_t *fl, int ptype);

*/

/* ############### barrier synchronization ############### */

/*
 * There are two barrier synchronization mechanisms:
 *
 *  1) interrupt resistant barrier synchronization
 *
 *     Interrupts may be enabled when using this mechanism but it's not
 *     guaranteed that all processes will run synchronously after
 *     having passed the barrier.
 *
 *  2) perfectly synchronizing barrier mechanism
 *
 *     !!! INTERRUPTS MUST BE DISABLED WHEN USING THIS MECHANISM !!!
 *
 *     Interrupts mustn't be enabled when using this mechanism;
 *     it's guaranteed that all processes will run perfectly synchronously
 *     after having passed the barrier.
 */

/* 
 *  the interrupt resistant barrier synchronization is implemented by the
 *  following functions:
 * 
 * (MODULO is the modulo bit which toggles on each round)
 *
 
 initialize a barrier - must be called before the first use of the
 barrier and should only be executed by one process (master)

    void sbp_barrier_init(sbp_barrier_t *b)
 
 barrier synchronization - wait until nprocs processes have executed
 a "sbp_barrier()" with same parameters values for "b" and "nprocs"

 If f != NULL then function "f" will be executed (with argument "farg")
 by the last process entering the barrier; during execution of "f" all
 other processes will hang in the barrier.
  "f" MUSTN'T ATTEMPT TO USE b !!!

 If there are fewer than nprocs processes executing "sbp_barrier()",
 "sbp_barrier()" will loop infinitely; if there are more than nprocs
 processes executing "sbp_barrier()", the behavior is unpredictable !!!

    void sbp_barrier(sbp_barrier_t *b, int nprocs, void (*f)(void *),
                                                                    void *farg)
 There is a function that can be given to sbp_barrier() as argument f
 and which will reset a parallel counter:

 void sbp_counter_set(void *counter_and_value)

 "counter_and_value" must be given to sbp_barrier as argument "farg"; it
 must point to the first element of an array "unsigned int cnt_set[2]" where
 cnt_set[0] must be set to the address of the shared counter and cnt_set[1] must
 be set to the value the counter is initialized with

 *
 *  the non interrupt resistant barrier synchronization is implemented by the
 *  following functions:
 *
 *  !!! INTERRUPTS MUST BE DISABLED WHEN USING THIS MECHANISM !!!
 * 
 * (MODULO is the modulo bit which toggles on each round)
 *
 
 initialize a barrier - must be called before the first use of the
 barrier and should only be executed by one process (master)

    void sbp_sync_barrier_init(sbp_sync_barrier_t *sb)
 
 barrier synchronization - wait until nprocs processes have executed
 a "sbp_sync_barrier()" with same parameters values for "sb" and "nprocs";
 all "nprocs" processes will run perfectly synchronous after having
 passed the barrier

 If there are fewer than nprocs processes executing "sbp_sync_barrier()",
 "sbp_sync_barrier()" will loop infinitely; if there are more than nprocs
 processes executing "sbp_sync_barrier()", the behavior is unpredictable !!!

    void sbp_sync_barrier(sbp_sync_barrier_t *sb, int nprocs)

 */


/* ############### parloop ############### */

/*
 * the parallel loop  mechanism is implemented by the
 * following functions:
 * 
 * (MODULO is the modulo bit which toggles on each round)
 *
 
 initialize a parallel loop  - must be called before the first use
 of the loop and should only be executed by one process (master)

    void sbp_parloop_init(sbp_parloop_t *pl);
 
 return next index in parallel loop - the index varies between 0 and
 "max" (inclusive)
 at the end of the loop the "nprocs" processes will be synchronized
 (by barrier synchronization), the loop will be reinitialized and -1
 will be returned as index
 (this mechanism has the same semantics as the P4-getsub-monitor)

   int sbp_parloop(sbp_parloop_t *pl, int max, int nprocs, int stride)

 */

/* ############# calculation of a unique id in a process group  ############ */

/*
 * unsigned int sbp_get_id_in_group(unsigned int procs_in_group,
 *                                  unsigned int *counter,
 *							        sbp_barrier_t *barrier)
 *
 * function:
 *
 * - calculate  a unique id in a process group PG
 * - synchronize the processes after having calculated their unique id
 *
 * arguments:
 *
 * - "procs_in_group" specifies the number of processes in the PG
 * - "counter" points to counter in shared memory
 *   it must have the same value for all processes of PG
 * - "barrier" points to a barrier that is used to synchronize the processes
 *   it must have the same value for all processes of PG
 *
 * return:
 *
 * - returns the unique id for the process
 * - as a side effect *counter is set to the number of processes
 *   in the process group
 *
 */

/* ############### Implementation of a parallel copy routine ############### */

/*
 * void sbp_par_copy_1_n(unsigned int *source_adr, unsigned int copy_len,      
 *                       unsigned int **destination_adr, unsigned int num_dest,
 *                       unsigned int num_procs,                               
 *                       shared unsigned int *counter,                         
 *                       shared sbp_barrier_t *barrier)                        
 *
 * function:
 *
 * - makes num_dest copies of a memory area of length copy_len starting at
 *   address source_adr
 * - each element of the array destination_adr points to the start address
 *   of one of the num_dest destination areas
 * - num_procs is the number of processors involved in copying (it must
 *   not necessarily be equal to num_dest)
 *
 * preconditions:
 *
 * - *counter must be initialized with 0
 * - *barrier must be initialized
 *
 * restrictions:
 *
 * - THE SOURCE AREA AND THE DESTINATION AREAS MUST NOT OVERLAP
 * - THE SOURCE ADDRESS AND THE DESTINATION ADDRESSES MUST NOT BE PRIVATE
 *   ADDRESSES
 *
 */

/* ############ parallel queue/broadcast queue, parallel stack ############# */

/*
 * parallel queue:
 *
 *   Implementation of a FIFO-like parallel queue with internal remove
 *   of queued elements
 *
 * broadcast queue:
 *
 *   Implementation of a queue where each queued element is read by
 *   all processes of a certain process group
 *
 * parallel stack:
 *   Implementation of a parallel LIFO queue
 *
 * - types for queueable/stackable objects can be defined by using the
 *   "SBP_PQ_TYPEDEF"-, the "SBP_BQ_TYPEDEF"- or the "SBP_ST_TYPEDEF"-macro;
 *   the three macros are interchangeable: if e is an element of a type that
 *   has been defined by using one of the three macros, e can be put in
 *   a parallel queue, a broadcast queue or a parallel stack
 *   
 * - objects of different user defined types can be put into the same
 *   queue/stack  (the user himself must provide a mechanism to recognize
 *   the type of an element e.g. by using an integer value as type in each
 *   queued element)
 *
 */


/* ================================= */
/*   macro & function descriptions   */
/* ================================= */


/*
 * SBP_PQ_TYPEDEF(typename, items...)/SBP_BQ_TYPEDEF(typename, items...)
 * SBP_ST_TYPEDEF(typename, items...)
 *
 * Defines a queuable/stackable type with items "items..."
 *
 *   This macro takes a type name as its first argument and the following
 *   arguments define the user data area - here a typical definition:
 *   (As a side effect, a struct of name "typename" is defined; this struct
 *    can be used to create self-referring queuable data types.)
 *
 *   Example:
 *
 *	SBP_PQ_TYPEDEF(mytype,
 *		       int   item_1;
 *		       float item_2;
 *		       char  item_3[10];
 *             struct mytype *next;
 *		      )
 *   The new type "mytype" can be used like a "normal" C type, e.g.:
 *
 *	mytype *ele1 = (mytype *) sbo_shmalloc(sizeof(mytype));
 *	mytype *ele2 = (mytype *) sbo_shmalloc(sizeof(mytype));
 *	ele1->item_1 = 4711;
 *	ele1->item_2 = 08.15;
 *	strcpy(ele1->item_3, "hello !");
 *  ele1->next = ele2;
 *
 *   ############### IMPORTANT - IMPORTANT IMPORTANT ###############
 *
 *   - WHEN USING THE "sbp_pq_remove()"-FUNCTION A QUEUE ELEMENT NEVER
 *     MUST BE FREED TO THE SHARED MEMORY MANAGEMENT SYSTEM (BY USING
 *     THE "shfree"-FUNCTION). THAT'S THE ONLY WAY TO GUARANTEE CORRECT
 *     BEHAVIOR OF THE PARALLEL QUEUE
 *
 *   - ALWAYS USE SYMBOLIC NAMES TO REFER TO USER DEFINED ITEMS IN AN OBJECT
 *     OF A TYPE THAT HAS BEEN DEFINED BY "SBP_PQ_TYPEDEF" - NEVER USE
 *     POINTER ARITHMETIC !!!
 *
 *     e.g. (cf. example above):
 *
 *	   ele[0] != ele->item_1;
 *	   *(((char *) ele) + sizeof(int)) != ele->item_2
 *
 * ############### IMPORTANT - IMPORTANT IMPORTANT ###############
 *   
 */

/* -------------------- parallel queue -------------------- */

/*
 * int sbp_pq_init(sbp_pq *pq, unsigned int queuesize)
 *
 * sbp_pq_par_init SHOULD BE PREFERRED SINCE IT IS FASTER
 *
 * function:
 *
 *   - initializes the parallel queue *pq (which must be situated in shared
 *                                         memory, not in private memory)
 *   - "queuesize" must be >= 1
 *   - "queuesize" is rounded to the next power of two
 *      (the queue consists of "queuesize" lists of elements which can
 *       be parallely accessed)
 *
 * return value:
 *
 *   - returns TRUE if the initialization of queue *pq was succesful
 *   - returns FALSE otherwise
 *
 */

/* 
 * int sbp_pq_par_init(sbp_pq *pq, unsigned int queuesize,
 *                    sbp_barrier_t *barrier, unsigned int num_init_procs)
 *
 * function:
 *
 *   - like sbp_pq_init() but must be called by num_init_procs to parallely
 *     initialize the parallel q *pq
 *
 *   - barrier MUST BE INITIALIZED BEFORE ANY PROCESS CALLS THIS FUNCTION
 *
 * return value:
 *
 *   - like sbp_pq_init()
 * 
 *
 */

/*
 * int sbp_pq_noe(sbp_pq *pq)
 *
 * function:
 *
 *   - returns the number of elements stored in the parallel queue *pq
 *     (this is only a snapshot!)
 *
 * remark:
 *
 *      THIS FUNCTIONS MAY RETURN A NUMBER >0 EVEN
 *      IF THERE IS NO ELEMENT IN THE QUEUE
 *
 *   (this only happens if some element has been removed from the queue
 *    by using the "sbp_pq_remove()" operation and this removal has not
 *    yet been detected by a "sbp_pq_read()")
 *
 */

/*
 * int sbp_pq_read(sbp_pq *pq, void *ele_pt_pt)
 *
 * arguments:
 *
 *   - ele_pt_pt must be a pointer to a pointer to an object of type t,
 *     where t has been defined by using SBP_PQ_TYPEDEF, SBP_BQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *
 * function:
 *
 *   - tries to extract an element from the parallel queue *pq
 *
 * return:
 *
 *   there're three cases:
 *
 *   - function returns 1 (TRUE) and *ele_pt_pt != NULL
 *     ==> read was successful, *ele_pt_pt points to the dequeued element
 *   - function returns 1 (TRUE) and *ele_pt_pt == NULL
 *     ==> tried to read an element which has been removed by a
 *         "sbp_pq_remove()" before
 *   - function returns 0 (*ele_pt_pt is not changed)
 *     ==> read was not successful
 *
 */

/*
 * void sbp_pq_write(sbp_pq *pq, void *ele_pt)
 *
 * arguments:
 *
 *   - ele_pt must be a pointer to an object of type t,
 *     where t has been defined by using SBP_PQ_TYPEDEF, SBP_BQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *
 * function:
 *
 *   - writes the element "*ele_pt" into the parallel queue "*pq"
 *
 */

/*
 * int sbp_pq_remove(sbp_pq *pq, void *ele_pt)
 *
 * arguments:
 *
 *   - ele_pt must be a pointer to an object of type t,
 *     where t has been defined by using SBP_PQ_TYPEDEF
 *
 * function:
 *
 *   - tries to delete element "*ele_pt" in the parallel queue "*pq"
 *
 * return value:
 *
 *   - TRUE (1) if the deletion was succesful
 *   - FALSE (0) otherwise (i.e. the element was not in queue "*pq")
 *
 */

/* -------------------- broadcast queue -------------------- */

/*
 * int sbp_bq_init(sbp_bq *bq, unsigned int queuesize,
 *                 unsigned int procs_in_group)
 *
 * sbp_bq_par_init SHOULD BE PREFERRED SINCE IT IS FASTER
 *
 * function:
 *
 *   - initializes the broadcast queue *bq (which must be situated in shared
 *                                          memory, not in private memory)
 *   - "queuesize" must be >= 1
 *   - "queuesize" is rounded to the next power of two
 *      (the queue consists of "queuesize" lists of elements which can
 *       be parallely accessed)
 *   - "procs_in_group" specifies the number of processes in the process group
 *     accessing the broadcast queue *bq
 *
 * return value:
 *
 *   - returns TRUE if the initialization of queue *bq was succesful
 *   - returns FALSE otherwise
 *
 */

/*
 * int sbp_bq_par_init(sbp_bq *bq, unsigned int queuesize,
 *                     unsigned int procs_in_group,
 *                     sbp_barrier_t *barrier, unsigned int num_init_procs)
 *
 *
 * function:
 *
 *   - like sbp_bq_init() but must be called by num_init_procs to parallely
 *     initialize the broadcast q *bq
 *
 *   - barrier MUST BE INITIALIZED BEFORE ANY PROCESS CALLS THIS FUNCTION
 *
 * return value:
 *
 *   - like sbp_bq_init()
 * 
 *
 */


/*
 * int sbp_bq_noe(sbp_bq *bq, unsigned int id_in_group);
 *
 * function:
 *
 *   - returns the number of elements stored in the broadcast queue *bq
 *     for the calling process (the number may vary among the different
 *     processes of the process group using *bq)
 *     (this is only a snapshot!)
 *
 *   - "id_in_group" is the unique id of the calling process in the
 *     process group using the broadcast queue *bq
 *     (it's a number between 0 and [the value given by "procs_in_group"
 *      when calling "sbp_bq_init()"]-1)
 *
 */

/*
 * int sbp_bq_read(sbp_bq *bq, void *ele_pt_pt, void *(*ele_copy)(void *),
 *                 unsigned int id_in_group)
 *
 * arguments:
 *
 *   - ele_pt_pt must be a pointer to a pointer to an object of type t,
 *     where t has been defined by using SBP_BQ_TYPEDEF or SBP_PQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *   - "id_in_group" is the unique id of the calling process in the
 *     process group using the broadcast queue *bq
 *     (it's a number between 0 and [the value given by "procs_in_group"
 *      when calling "sbp_bq_init()"]-1)
 *   - ele_copy must be a pointer to a user provided function that copies
 *     an element e of type t and returns the copy;
 *     this function must copy e item by item, not by using a function like
 *     memcpy() since this could cause runtime errors in the SBPRAM-network
 *
 *     example:
 *
 *      + CORRECT version:
 *
 *        void *ele_copy(void *ele)
 *        {
 *            t *old_ele, *new_ele;
 *
 *            old_ele = (t *)ele     // "cast" ele to type t
 *            new_ele = ele_alloc(); // must be provided by the user
 *            new_ele->item_1 = old_ele->item_1;
 *            new_ele->item_2 = old_ele->item_2;
 *                            .
 *                            .
 *                            .
 *            new_ele->item_n = old_ele->item_n;
 *
 *            return new_ele;
 *        }
 *
 *      + WRONG version:
 *
 *        void *ele_copy(void *ele)
 *        {
 *            t *old_ele, *new_ele;
 *
 *            old_ele = (t *)ele     // "cast" ele to type t
 *            new_ele = ele_alloc(); // must be provided by the user
 *
 *  WRONG ==> memcpy(new_ele, old_ele, sizeof(t)); // THIS MIGHT PRODUCE
 *                                                 // RUNTIME ERRORS !!!
 *            return new_ele;
 *        }
 * 
 *
 * function:
 *
 *   - tries to extract an element from the broadcast queue *bq
 *   - an element e is extracted, even when the calling process has
 *     queued e by calling sbp_bq_write(bq, e)
 *     (in terms of message passing: the broadcasted element is also
 *      sent to the initiator of the broadcast)
 *
 * return:
 *
 *   - function returns 1 (TRUE)
 *     ==> read was succesfull, *ele_pt_pt points to the dequeued element
 *   - function returns 0
 *     ==> read was not succesfull, *ele_pt_pt is not changed
 *
 * remark:
 *
 *   AN ELEMENT E IS NOT REALLY DEQUEUED UNTIL ALL PROCESSES OF THE
 *   PROCESS GROUP HAVE SEEN E; 
 *   ONLY THE LAST PROCESS THAT READS E DEQUEUES E - ALL OTHER PROCESSES
 *   GET A COPY OF E
 *
 */

/*
 * void sbp_bq_write(sbp_bq *bq, void *ele_pt)
 *
 * arguments:
 *
 *   - ele_pt must be a pointer to an object of type t,
 *     where t has been defined by using SBP_BQ_TYPEDEF or SBP_PQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *
 * function:
 *
 *   - writes the element "*ele_pt" into the broadcast queue "*bq"
 *
 */

/* -------------------- parallel stack -------------------- */


/*
 * int sbp_st_init(sbp_st *st, unsigned int stacksize)
 *
 * sbp_st_par_init SHOULD BE PREFERRED SINCE IT IS FASTER
 *
 * function:
 *
 *   - initializes the parallel stack *st (which must be situated in shared
 *                                         memory, not in private memory)
 *   - "stacksize" must be >= 1
 *   - "stacksize" is rounded to the next power of two
 *      (the stack consists of "stacksize" lists of elements which can
 *       be parallely accessed)
 *
 * return value:
 *
 *   - returns TRUE if the initialization of stack *st was succesful
 *   - returns FALSE otherwise
 *
 */

/* 
 * int sbp_st_par_init(sbp_st *st, unsigned int stacksize,
 *                     sbp_barrier_t *barrier, unsigned int num_init_procs)
 *
 * function:
 *
 *   - like sbp_st_init() but must be called by num_init_procs to parallely
 *     initialize the parallel stack *st
 *
 *   - barrier MUST BE INITIALIZED BEFORE ANY PROCESS CALLS THIS FUNCTION
 *
 * return value:
 *
 *   - like sbp_st_init()
 * 
 */



/*
 * int sbp_st_noe(sbp_st *st);
 *
 *
 * function:
 *
 *   - returns the number of elements stored in the parallel stack *st
 *     (this is only a snapshot!)
 *
 */


/*
 * int sbp_st_pop(sbp_st *st, void *ele_pt_pt);
 *
 * arguments:
 *
 *   - ele_pt_pt must be a pointer to a pointer to an object of type t,
 *     where t has been defined by using SBP_PQ_TYPEDEF, SBP_BQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *
 * function:
 *
 *   - tries to pop an element from the parallel stack *st
 *
 * return:
 *
 *   - function returns 1 (TRUE)
 *     ==> pop was successful, *ele_pt_pt points to the popped element
 *   - function returns 0 (*ele_pt_pt is not changed)
 *     ==> pop was not successful
 *
 */


/*
 * void sbp_st_push(sbp_st *st, void *ele_pt);
 *
 * arguments:
 *
 *   - ele_pt must be a pointer to an object of type t,
 *     where t has been defined by using SBP_PQ_TYPEDEF, SBP_BQ_TYPEDEF
 *     or SBP_ST_TYPEDEF
 *
 * function:
 *
 *   - pushes the element "*ele_pt" onto the parallel stack "*st"
 *
 */

/*
   EODOKU (for documentation)
*/

/*
   ######################################################################

 			    begin of header file

   ######################################################################
*/

/*
 * Include guard.
 * NOTE(review): the guard name says "OLDQUEUE" although the file header
 * calls this file sbp_multiproc.h - confirm it cannot clash with the guard
 * of another header.
 */
#ifndef _SBP_MULTIPROC_OLDQUEUE_H_
#define _SBP_MULTIPROC_OLDQUEUE_H_

/*
 * EXTERN_INLINE is the prefix used for the function definitions below:
 *  - if _SBP_MULTIPROC_C_ is defined it expands to plain "extern"
 *  - otherwise it expands to "extern inline"
 * (presumably exactly one .c file defines _SBP_MULTIPROC_C_ before including
 *  this header so that it emits the out-of-line copies, while all other
 *  files only get the inlinable bodies - TODO confirm against that file)
 */
#ifdef _SBP_MULTIPROC_C_
#define EXTERN_INLINE extern
#else
#define EXTERN_INLINE extern inline
#endif

#include <stddef.h>
#include <sbp/sbp_limits.h>
#include <sbp/sbp_gop.h>
#include <syscall.h>

/* ############### definition of simple lock  ############### */

/* 
 * we use two global operations to access the memory cell,
 * holding the lock: "stg" and "mpmax"
 *
 * to be sure that there won't occur an error in a sorting node, in a
 * network node or in a memory unit during access to the memory cell,
 * we'll do "stg"/"mpmax" only if MODULO == 0/1
 *
 */

/*
 * sbp_lock_t
 */

/*
 * a simple lock is one int memory cell: sbp_lock_init() and sbp_unlock()
 * store 0 into it, sbp_lock() spins on it
 * (note: this is a macro, not a typedef)
 */
#define sbp_lock_t int

/*
 * sbp_lock_init()
 *
 * puts the simple lock "*lock" into the unlocked state (value 0);
 * must be called before the first sbp_lock() on this lock and should
 * be executed by one process only
 */

EXTERN_INLINE void sbp_lock_init(sbp_lock_t *lock)
{
   lock[0] = 0; /* 0 == nobody holds the lock */
}

/*
 * sbp_lock()
 */

/*
 * acquire the simple lock "*lock"
 *
 * - loops until the lock could be caught, there is no timeout
 * - the lock must have been initialized by sbp_lock_init()
 * - the lock cell is accessed with "mpmax" and the value 1; the value that
 *   comes back in %0 is tested and the access is repeated while it is != 0
 *   (presumably "mpmax" delivers the previous content of the cell, so
 *    0 means "the lock was free and is mine now" - TODO confirm against
 *    the SBPRAM instruction set description)
 * - the "memory" clobber keeps the compiler from moving memory accesses
 *   across the asm statement
 */
EXTERN_INLINE void sbp_lock(sbp_lock_t *lock)
{

   int dummy; /* to make it work correctly on compiler-optimizations;
                 it is the scratch register %0 of the asm below and its
                 value is not used afterwards */

   asm volatile ("bms 0            \n" /* do mpmax only if MODULO == 1 */
                 "\tmpmax %2, 0, %0\n" /* try to catch lock        */
                 "\tnop            \n" /* delay slot               */
                 "\tadd %0, 0, r0  \n" /* set cc                   */
                 "\tbne -3          "  /* retry if lock was locked */
                  : "=&r" (dummy)  /* %0 */
                  : "0"   (1),     /* %1 - tied to %0: %0 starts with the
                                      value 1; %1 itself never appears in
                                      the template */
                    "r"   (lock)   /* %2 */
                  : "cc", "memory"
                );

} /* sbp_lock */

/*
 * sbp_unlock()
 */

/*
 * release the simple lock "*lock" by storing 0 into the lock cell
 *
 * - there is no check whether the caller really holds the lock
 * - always returns 0 (for compatibility with sbp_save_unlock())
 */
EXTERN_INLINE int sbp_unlock(sbp_lock_t *lock)
{
   asm volatile ("bmc 0          \n" /* do stg only if MODULO == 0 */
                 "\tstg %1, %0, 0  " /* unlock: store 0 into lock  */
                  :
                  : "r" (lock),  /* %0 - address of the lock cell */
                    "r" (0)      /* %1 - value that is stored     */
                  : "cc", "memory"
                 );

   return 0;

} /* sbp_unlock */

/* ############### definition of save lock  ############### */

/* 
 * we use "stg" and "mpmax" to access the memory cell holding the lock
 * we use "ldg" and "stg" to access the memory cell holding the owner
 *
 * to be sure that there won't occur an error in a sorting node, in a
 * network node or in a memory unit during access to the memory cell holding
 * the lock(owner) we'll do "stg"/"mpmax" ("ldg"/"stg") only if MODULO == 0/1
 *
 */

/*
 * sbp_save_lock_t
 */

/*
 * state of a save lock; initialize with sbp_save_lock_init()
 */
typedef struct
{
  int cell;   /* memory cell holding the lock; 0 after init and after a
                 successful unlock */
  int owner;  /* process-id of owner of lock (value of sys_getnr() of the
                 locking process); -1 after init and after a successful
                 unlock */

} sbp_save_lock_t;

/*
 * sbp_save_lock_init()
 */

/*
 * puts the save lock "*lock" into its initial state: no owner, not locked;
 * must be called before the first sbp_save_lock() on this lock and should
 * be executed by one process only
 */
EXTERN_INLINE void sbp_save_lock_init(sbp_save_lock_t *lock)
{
   /* -1 == no process owns the lock */
   (*lock).owner = -1;

   /* 0 == lock cell is free */
   (*lock).cell = 0;
}

/*
 * sbp_save_lock()
 */

/*
 * acquire the save lock "*lock" and record the caller as its owner
 *
 * - loops until the lock could be caught, there is no timeout
 * - lock->cell is accessed with "mpmax" and the value 1 exactly as in
 *   sbp_lock(); the access is repeated while the value coming back is != 0
 * - directly after leaving the loop the number delivered by sys_getnr()
 *   is stored into lock->owner; sbp_save_unlock() compares against it
 */
EXTERN_INLINE void sbp_save_lock(sbp_save_lock_t *lock)
{
   int procid = sys_getnr(); /* get number of virtual processor */
   int dummy; /* to make it work correctly on compiler-optimizations;
                 scratch register %0 of the asm below */

   asm volatile ("bms 0            \n" /* do mpmax only if MODULO == 1 */
                 "\tmpmax %2, 0, %0\n" /* try to catch lock        */
                 "\tnop            \n" /* delay slot               */
                 "\tadd %0, 0, r0  \n" /* set condition code       */
                 "\tbne -3         \n" /* retry if lock was locked */
                 "\tstg %3, %4, 0    " /* store procid in lock->owner
                                          MODULO == 1 !!! */
                  : "=&r" (dummy)          /* %0 */
                  : "0"   (1),             /* %1 - tied to %0: initial
                                              value 1 for the mpmax */
                    "r"   (&(lock->cell)), /* %2 */
                    "r"   (procid),        /* %3 */
                    "r"   (&(lock->owner)) /* %4 */
                  : "cc", "memory"
                );

} /* sbp_save_lock */

/*
 * sbp_save_unlock()
 */

/*
 * release the save lock "*lock" if (and only if) the caller owns it
 *
 * return value:
 *   0 - unlock was successful (owner reset to -1, cell reset to 0)
 *   1 - the lock cell was 0, i.e. the lock wasn't locked
 *   2 - the recorded owner differs from sys_getnr() of the caller
 * in case 1 and 2 the state of the lock isn't changed
 *
 * the function consists of two separate asm statements: the first one only
 * reads cell and owner, the second one (executed only in case 0) writes
 * them back
 */
EXTERN_INLINE int sbp_save_unlock(sbp_save_lock_t *lock)
{
   int lock_value; /* content of lock->cell as delivered by the mpmax */
   int lock_owner; /* content of lock->owner                          */

   /* first we'll get the actual value of the lock */

   asm volatile ("bms 0            \n" /* do mpmax only if MODULO == 1 */
                 "\tmpmax %3, 0, %0\n" /* mpmax 0 won't change state of lock */
                 "\tldg   %4, 0, %1\n" /* ### delay slot ### 
                                          lock_owner = lock->owner 
                                          MODULO == 0 !!!        */
                 "\tnop              " /* delay slot             */
                  : "=&r" (lock_value),   /* %0 */
                    "=&r" (lock_owner)    /* %1 */
                  : "0"  (0),             /* %2 - tied to %0: the mpmax is
                                             done with the value 0 */
                    "r"  (&(lock->cell)), /* %3 */
                    "r"  (&(lock->owner)) /* %4 */
                  : "cc", "memory"
                );
 
   if (lock_value == 0)
   /*
    * try to unlock lock which isn't locked 
    *
    * we mustn't change the state of the lock because it could have been
    * locked by another process meanwhile
    *
    */
   {
      return 1;
   }
   else if (lock_owner != sys_getnr())
   /*
    * try to unlock lock that the executing process doesn't own
    *
    * we mustn't change the state of the lock
    *
    */
   {
      return 2;
   }
   else
   /* executing process is owner of lock - unlock lock */
   {
      /* the owner is reset first, the lock cell is freed one instruction
         later (the MODULO bit toggles in between) */
      asm volatile ("bms 0          \n" /* stg in owner only if MODULO == 1 */
                    "\tstg %3, %2, 0\n" /* store -1 into lock->owner        */
                    "\tstg %1, %0, 0  " /* unlock: store 0 into lock->cell  */
                     :
                     : "r" (&(lock->cell)),  /* %0 */
                       "r" (0),              /* %1 */
                       "r" (&(lock->owner)), /* %2 */
                       "r" (-1)              /* %3 */
                     : "cc", "memory"
                   );
      return 0;
   }

} /* sbp_save_unlock */

/* ############### definition of fair lock  ############### */

/* 
 * we use "mpadd" to access the memory cell holding the next free number
 * we use "ldg" and "syncadd" to access the memory cell holding the number
 * of the current owner of the lock
 *
 * to be sure that there won't occur an error in a sorting node, in a
 * network node or in a memory unit during access to the memory cell holding
 * the actnum we'll do "syncadd"/"ldg" only if MODULO == 0/1
 *
 */

/*
 * sbp_fair_lock_t
 */

typedef struct
{
   int nextnum; /* next free number            */
   int actnum;  /* number of actual lock owner */

} sbp_fair_lock_t;

/*
 * sbp_fair_lock_init()
 */

/*
 * resets the fair lock "*lock": the next number to hand out and the number
 * whose turn it is both start at 0;
 * must be called before the first sbp_fair_lock() on this lock and should
 * be executed by one process only
 */
EXTERN_INLINE void sbp_fair_lock_init(sbp_fair_lock_t *lock)
{
   lock->nextnum = lock->actnum = 0;
}

/*
 * sbp_fair_lock()
 */

/*
 * acquire the fair lock "*lock"
 *
 * - one "mpadd" with the value 1 on lock->nextnum delivers the caller's
 *   number in %0
 *   (presumably the content of nextnum before the add - TODO confirm
 *    against the SBPRAM instruction set description)
 * - afterwards lock->actnum is loaded again and again until it is equal
 *   to the caller's number; there is no timeout
 * - access to the lock is therefore granted in demand order
 */
EXTERN_INLINE void sbp_fair_lock(sbp_fair_lock_t *lock)
{
   int dummy1; /* to make it work correctly on compiler-optimizations;
                  %0: number drawn by the caller */
   int dummy2; /* %1: last value read from lock->actnum */

   asm volatile ("mpadd %3, 0, %0\n" /* get next number              */
                 "\tbms 0          \n" /* delay-slot                 */
                 "\tldg %4, 0, %1  \n" /* get actnum (MODULO == 1 !) */
                 "\tnop            \n" /* delay slot                 */
                 "\tsub %1, %0, r0 \n" /* mynum - actnum             */
                 "\tbne -3           " /* not my turn -> loop        */

                  : "=&r" (dummy1),           /* %0 */
                    "=&r" (dummy2)            /* %1 */
                  : "0"   (1),                /* %2 - tied to %0: the mpadd
                                                 adds the value 1 */
                    "r"   (&(lock->nextnum)), /* %3 */
                    "r"   (&(lock->actnum))   /* %4 */
                  : "cc", "memory"
                );

} /* sbp_fair_lock */

/*
 * sbp_fair_unlock()
 */

/*
 * release the fair lock "*lock": 1 is added to lock->actnum so that the
 * process waiting in sbp_fair_lock() with the next number can proceed
 *
 * - there is no check whether the caller really holds the lock
 * - always returns 0 (for compatibility with sbp_save_unlock())
 */
EXTERN_INLINE int sbp_fair_unlock(sbp_fair_lock_t *lock)
{
   asm volatile ("bmc 0              \n" /* do syncadd only if MODULO == 0 */
                 "\tsyncadd %1, %0, 0\n"     /* actnum += 1 */
                  :
                  : "r"   (&(lock->actnum)), /* %0 */
                    "r"   (1)                /* %1 */
                  : "cc", "memory"
                );
 
   return 0;

} /* sbp_fair_unlock */


/* ############## definition of priority-reader/writer-lock  ############## */

/* 
 * we use "mpadd" and "syncor" to access the memory cell holding the number
 * of readers owning the lock and the writer-access-flag
 *
 * to be sure that there won't occur an error in a sorting node, in a
 * network node or in a memory unit during access to the memory cell holding
 * the reader counter we'll do "syncor"/"mpadd" only if MODULO == 0/1
 *
 */

#define SBP_RW_READER	0
#define SBP_RW_WRITER	1
#define ___RW_FLAG___	0x40000000 /* 2^30 */
                        /* we can't use 2^31 since we have to calculate
                           corrrectly -(___RW_FLAG___) */
/*
 * sbp_rw_lock_t
 */

/* priority reader/writer lock */
typedef struct
{
   unsigned int    reader_cnt;  /* Bits 0-29: number of readers holding (or
                                   trying to get) the lock
                                   Bit 30 (___RW_FLAG___): set while a
                                   writer owns or is waiting for the lock */
   sbp_fair_lock_t writer_lock; /* serializes the writers: taken in
                                   sbp_rw_lock(), released in
                                   sbp_rw_unlock() */

} sbp_rw_lock_t;

/*
 * sbp_rw_lock_init()
 */

/*
 * Put a reader/writer lock into its idle state: writer queue empty,
 * no readers registered and writer flag cleared.
 */
EXTERN_INLINE void sbp_rw_lock_init(sbp_rw_lock_t *lock)
{
   sbp_fair_lock_init(&lock->writer_lock);
   lock->reader_cnt = 0;
}

/*
 * sbp_rw_lock()
 */

/*
 * Acquire a priority reader/writer lock.
 *
 * ptype == SBP_RW_READER : enter as reader (shared with other readers)
 * any other value        : enter as writer (exclusive)
 *
 * Writers queue up on the embedded fair lock, raise ___RW_FLAG___ in
 * reader_cnt and then wait until the reader count has dropped to zero.
 * Readers wait while the flag is set, register themselves by adding 1
 * and back off again if a writer raised the flag in the meantime.
 */
EXTERN_INLINE void sbp_rw_lock(sbp_rw_lock_t *lock, int ptype)
{
   if(ptype != SBP_RW_READER)
   {
      /* writer: first wait for my turn among the writers ... */
      sbp_fair_lock(&(lock->writer_lock));

      /* ... then tell the readers that a writer wants the lock ... */
      sbp_syncor_m0(&(lock->reader_cnt), ___RW_FLAG___);

      /* ... and wait until the last reader has left */
      while(sbp_mpadd_m1(&(lock->reader_cnt), 0) & (___RW_FLAG___-1))
      {
         /* spin */
      }

      return;
   }

   /* reader */
   for(;;)
   {
      /* wait for the writer to leave the lock */
      while(sbp_mpadd_m1(&(lock->reader_cnt), 0) & ___RW_FLAG___)
      {
         /* spin */
      }

      /* flag is clear - register as reader and check the flag again */
      if(!(sbp_mpadd_m1(&(lock->reader_cnt), 1) & ___RW_FLAG___))
      {
         break; /* got the lock */
      }

      /* a writer was faster: undo the registration and start over */
      sbp_mpadd_m1(&(lock->reader_cnt), -1);
   }

} /* sbp_rw_lock() */

/*
 * sbp_rw_unlock()
 */

/*
 * Release a priority reader/writer lock.
 *
 * ptype : must be the same value that was passed to sbp_rw_lock()
 * wait  : only used by writers - number of iterations of a delay loop
 *         executed between re-admitting the readers and admitting the
 *         next writer; values <= 0 mean no delay
 */
EXTERN_INLINE void sbp_rw_unlock(sbp_rw_lock_t *lock, int ptype, int wait)
{
   if(ptype == SBP_RW_READER)
   /* reader-unlock */
   {
      sbp_mpadd_m1(&(lock->reader_cnt), -1);
   }
   else
   /* writer-unlock */
   {
      int i;

      /* free lock for readers */

      sbp_mpadd_m1(&(lock->reader_cnt), -___RW_FLAG___);

      /* wait loop
       *
       * The loop has no side effects, so an optimizing compiler may
       * delete it completely.  The empty volatile asm emits no
       * instruction but forces the compiler to keep every iteration.
       */

      for(i = 0; i < wait; i++)
      {
         __asm__ __volatile__ ("");
      }

      /* free for writers */

      sbp_fair_unlock(&(lock->writer_lock));
   }

} /* sbp_rw_unlock() */

/* ############### definition of fair group lock  ############### */

/*
 * sbp_gr_fair_lock_t
 *
 */

/* fair group lock */
typedef struct
{
	unsigned int  ticket;    /* next ticket to hand out; incremented by
	                            every call of sbp_gr_fair_lock() */
	unsigned int  next;      /* incremented by every call of
	                            sbp_gr_fair_unlock(); a process may enter
	                            when this equals its ticket */
	unsigned int  act_group; /* group that currently owns the lock
	                            (0xffffffff after initialization) */
	unsigned int  act_diff;  /* "ticket - group count" value of the
	                            processes currently allowed to enter */
    unsigned int *group_cnt; /* one counter per group, allocated with
	                            shmalloc() in sbp_gr_fair_lock_init() */

} sbp_gr_fair_lock_t;

/*
 * sbp_gr_fair_lock_init()
 *
 * highest group number: 0xfffffffe
 *
 */

/* allocator used by sbp_gr_fair_lock_init() for the per-group counters;
   a NULL result is treated as failure there */
extern inline void *shmalloc(unsigned int size); /* we need this prototype !!! */

/*
 * Initialize a fair group lock for groups 0 .. num_groups-1.
 *
 * Allocates one counter per group with shmalloc() and clears all lock
 * state.
 *
 * returns 1 on success,
 *         0 if the counter array could not be allocated or if its size
 *           in bytes does not fit into shmalloc()'s "unsigned int"
 *           argument (gl->group_cnt is NULL in both cases)
 */
EXTERN_INLINE int sbp_gr_fair_lock_init(sbp_gr_fair_lock_t *gl, unsigned int num_groups)
{
	unsigned int i;

	/* num_groups*sizeof(unsigned int) must not wrap around: a wrapped
	   size would yield an array that is too small for the loop below */
	if(num_groups > ~0U / sizeof(unsigned int))
	{
		gl->group_cnt = NULL;
		return 0;
	}

	gl->group_cnt = (unsigned int *)shmalloc(num_groups*sizeof(unsigned int));

	if(gl->group_cnt == NULL)
	/* allocating shared memory failed */
	{
		return 0;
	}

	gl->ticket = 0;
	gl->next = 0;
	gl->act_group = 0xffffffff; /* to be sure that no process accidentally
								   enters the lock the first time it calls
								   sbp_gr_fair_lock() */
	gl->act_diff = 0;

	for(i = 0; i < num_groups; i++)
	{
		gl->group_cnt[i] = 0;
	}

	return 1;

} /* sbp_gr_fair_lock_init() */


/*
 * sbp_gr_fair_lock()
 *
 */

/*
 * Acquire the fair group lock "gl" for a member of group "my_group".
 *
 * 1. Draw a ticket from gl->ticket and a per-group sequence number from
 *    gl->group_cnt[my_group] (both incremented by 1), and compute
 *    my_diff = ticket - group sequence number.
 * 2. Spin until either gl->next has reached my ticket, or the lock is
 *    currently owned by my group with the same "diff" value.
 * 3. If I entered because it was my ticket's turn and the active group is
 *    a different one, switch gl->act_group / gl->act_diff over to my
 *    group by adding the differences to the values read last.
 *
 * The disabled "#if 0" fragments show the plain C equivalent of the asm
 * statement that follows each of them.
 *
 * NOTE(review): unlike the asm statements of the other locks in this file,
 * the three asm statements here declare no "cc"/"memory" clobbers -
 * confirm that this is intended.
 * NOTE(review): "mpaddn" is presumably the multiprefix add for the other
 * MODULO phase - TODO confirm against the instruction set manual.
 */
EXTERN_INLINE void sbp_gr_fair_lock(sbp_gr_fair_lock_t *gl, unsigned int my_group)
{
	unsigned int my_ticket, my_gcount, my_diff;
	register unsigned int next_ticket, act_group, act_diff;

#if 0
	my_ticket = sbp_mpadd(&(gl->ticket), 1);     /* these two must happen    */
	my_gcount = sbp_mpadd(&(gl->group_cnt[my_group]), 1);  /* without an interrupt in between */
#endif

	/* %0/%1 are preloaded with the addend 1 through the matching
	   constraints "0"/"1" and receive the old counter values */
	asm volatile("bmc\t0             \n"    /* do the next two instructions
											   without interrupt */
				 "\tmpadd\t%2, 0, %0 \n"
				 "\tmpaddn\t%3, 0, %1\n"
				 : "=&r" (my_ticket),                  /* %0 */
				   "=&r" (my_gcount)                   /* %1 */
				 : "r"   (&(gl->ticket)),              /* %2 */
				   "r"   (&(gl->group_cnt[my_group])), /* %3 */
				   "0"   (1),
				   "1"   (1)
				);
	
	my_diff   = my_ticket - my_gcount;

	do
	{
#if 0
		next_ticket = gl->next;
		act_group   = gl->act_group;
		act_diff    = gl->act_diff;
#endif
		next_ticket = sbp_ldg_m1(&(gl->next));
		
		/* read act_group and act_diff together: multiprefix add with
		   addend 0 returns the current value without changing it */
		asm volatile("bmc\t0           \n"    /* do the next two instructions
					                             without interrupt */
					 "\tmpadd\t%2, 0, %0 \n"
					 "\tmpaddn\t%3, 0, %1\n"
					 : "=&r" (act_group),        /* %0 */
					   "=&r" (act_diff)          /* %1 */
					 : "r"   (&(gl->act_group)), /* %2 */
					   "r"   (&(gl->act_diff)),  /* %3 */
					   "0"   (0),
					   "1"   (0)
					);

	} while(!((next_ticket == my_ticket)
			  || ((act_group == my_group) && (act_diff == my_diff))));

	/* when we're here we can be sure that we can enter the lock */

	if((next_ticket == my_ticket) && (act_group != my_group))
	/* change group */
	{
		unsigned int scratch1, scratch2; /* old values, not used */
#if 0
		gl->act_group = -1;
		gl->act_diff  = my_diff;
		gl->act_group = my_group;
#endif
		/* gl->act_group and gl->act_diff can not have changed since 
		   the last time I read them
		   ==> I can use the values of act_group and act_diff
		   and set the new values by adding the differences */
	
		asm volatile("bmc\t0            \n" /* do the next two instructions
											   without interrupt */
					 "\tmpadd\t%2, 0, %0\n"
					 "\tmpaddn\t%3, 0, %1\n"
					 : "=&r" (scratch1),       /* %0 */
					   "=&r" (scratch2)        /* %1 */
					 : "r" (&(gl->act_group)), /* %2 */
					   "r" (&(gl->act_diff)),  /* %3 */
					   "0" (my_group-act_group),
					   "1" (my_diff-act_diff)
					);
		}

} /* sbp_gr_fair_lock() */


/*
 * sbp_gr_fair_unlock()
 *
 */

/*
 * Leave the fair group lock: every leaving process adds 1 to gl->next,
 * the counter that waiting processes compare with their ticket.
 */
EXTERN_INLINE void sbp_gr_fair_unlock(sbp_gr_fair_lock_t *gl)
{
	unsigned int *const served = &gl->next;

	sbp_syncadd_m0(served, 1);

} /* sbp_gr_fair_unlock() */


/* ############### definition of fair reader/writer-lock ############### */

/*
 * sbp_rw_fair_lock_t
 *
 */

/* fair reader/writer lock */
typedef struct
{
	sbp_gr_fair_lock_t group_lock;  /* group lock with two groups, indexed
	                                   by SBP_RW_READER / SBP_RW_WRITER */
	sbp_fair_lock_t writer_lock;    /* taken additionally by writers, so
	                                   that only one writer is active */

} sbp_rw_fair_lock_t;

/*
 * sbp_rw_fair_lock_init()
 *
 */

/*
 * Initialize a fair reader/writer lock.
 *
 * returns 1 on success, 0 if the embedded group lock (two groups:
 * readers and writers) could not be initialized; in that case the
 * writer lock is left untouched
 */
EXTERN_INLINE int sbp_rw_fair_lock_init(sbp_rw_fair_lock_t *fl)
{
	if(sbp_gr_fair_lock_init(&fl->group_lock, 2) == 0)
	{
		return 0;
	}

	sbp_fair_lock_init(&fl->writer_lock);

	return 1;

} /* sbp_rw_fair_lock_init() */


/*
 * sbp_rw_fair_lock()
 *
 */

/*
 * Acquire a fair reader/writer lock.
 *
 * "ptype" (SBP_RW_READER or SBP_RW_WRITER) is used directly as the group
 * number of the embedded group lock; writers additionally take the
 * writer lock after they have passed the group lock.
 */
EXTERN_INLINE void sbp_rw_fair_lock(sbp_rw_fair_lock_t *fl, int ptype)
{
	sbp_gr_fair_lock(&fl->group_lock, ptype);

	if(ptype != SBP_RW_WRITER)
	{
		return; /* readers are done */
	}

	sbp_fair_lock(&fl->writer_lock);

} /* sbp_rw_fair_lock() */


/*
 * sbp_rw_fair_unlock()
 *
 */

/*
 * Release a fair reader/writer lock in the reverse order of
 * sbp_rw_fair_lock(): writers first give up the writer lock, then
 * every process leaves the group lock.
 *
 * "ptype" must be the value that was passed to sbp_rw_fair_lock().
 */
EXTERN_INLINE void sbp_rw_fair_unlock(sbp_rw_fair_lock_t *fl, int ptype)
{
	const int was_writer = (ptype == SBP_RW_WRITER);

	if(was_writer)
	{
		sbp_fair_unlock(&fl->writer_lock);
	}

	sbp_gr_fair_unlock(&fl->group_lock);

} /* sbp_rw_fair_unlock() */


/* ############### barrier synchronization ############### */

/* =============== interrupt-resistant barrier =============== */

/*
 * sbp_barrier_t
 */

/* state of an interrupt-resistant barrier */
typedef struct
{
	int count; /* number of processes that have entered the barrier in the
	              current round; reset to 0 by the last one */
	int sema;  /* 0 while processes may enter; set to nprocs-1 by the last
	              process and decremented by every waiting process when it
	              leaves */

} sbp_barrier_t;

/*
 * sbp_barrier_init()
 */

/*
 * Reset a barrier: nobody has arrived (count == 0) and the barrier is
 * open for entry (sema == 0).
 */
EXTERN_INLINE void sbp_barrier_init(sbp_barrier_t *b)
{
   b->count = b->sema = 0;
}
 
/*
 * sbp_barrier()
 */

/*
 * Barrier synchronization of "nprocs" processes.
 *
 * b      : barrier, initialized with sbp_barrier_init()
 * nprocs : number of processes that take part
 * f,farg : if f != NULL, the last process that arrives calls f(farg)
 *          before it releases the others, i.e. while all other processes
 *          are still waiting inside the barrier
 *
 * Protocol:
 * - entry: wait until b->sema == 0 (all waiters of the previous round
 *   have left), then increment b->count and fetch its old value
 * - a process that is not the last one waits until b->sema != 0 and then
 *   decrements b->sema
 * - the last process resets b->count to 0 and stores nprocs-1 (the number
 *   of waiting processes) into b->sema, which releases the waiters
 */
EXTERN_INLINE void sbp_barrier(sbp_barrier_t *b, int nprocs,
                                            void (*f)(void *), void *farg)
{
   int notlast; /* 0 only for the process that arrives last */
   int dummy1; /* to make it work correctly on compiler-optimizations */
   int dummy2; /* to make it work correctly on compiler-optimizations */

   asm volatile ("bms 0             \n"
                 "\tldg %4, 0, %0   \n"  /* get b->sema             */
                 "\tnop             \n"  /* delay slot              */
                 "\tadd %0, 0 ,r0   \n"  /* set cc (b->sema==0)?    */
                 "\tbne -3          \n"  /* semaphore not free      */
                 "\tmpadd %3, 0, %1 \n"  /* increment b->count,
                                            get old counter value   */
                 "\tnop             \n"  /* delay slot              */
                 "\tsub %1, %6, %2  \n"  /* notlast = difference of oldcnt
                                            and n-1 (zero iff last) */

                 : "=&r" (dummy1),    /* %0 - rp */
                   "=&r" (dummy2),    /* %1 - rpc */
                   "=r"  (notlast)    /* %2 */
                 : "r" (&(b->count)), /* %3 */
                   "r" (&(b->sema)),  /* %4 */
                   "1" (1),           /* %5 - rpc, addend for mpadd */
                   "r" (nprocs-1)     /* %6 */
                 : "cc", "memory"
                );

   if(notlast)
   /* not last process entering barrier => wait for last process */
   {
      asm volatile ("bms 0             \n"
                    "\tldg %1, 0, %0   \n"  /* get b->sema             */
                    "\tnop             \n"  /* delay slot              */
                    "\tadd %0, 0 ,r0   \n"  /* set cc (b->sema==0)?    */
                    "\tbeq -3          \n"  /* semaphore is still free */
                    "\tnop             \n"  /* MODULO-switch           */

                    /* last process has entered barrier => leave barrier */

                    "\tsyncadd %2, %1, 0\n" /* decrement semaphore     */

                    : "=&r" (dummy1)     /* %0 - rp */
                    : "r" (&(b->sema)),  /* %1 */
                      "r" (-1)           /* %2 */
                    : "cc", "memory"
                   );
   }
   else
   /* last process entering barrier */
   {
      if(f != NULL)   
      /* process executes function specified as parameter */
      {
         f(farg);
      }

      /* now process can reset counter and lock semaphore */

      asm volatile ("bms 0             \n"
                    "\tstg %3, %0, 0   \n"  /* counter reset  */
                    "\tstg %2, %1, 0   \n"  /* lock semaphore: one unit per
                                               waiting process */
                    :
                    : "r" (&(b->count)), /* %0 */
                      "r" (&(b->sema)),  /* %1 */
                      "r" (nprocs-1),    /* %2 */
                      "r" (0)            /* %3 */
                    : "cc", "memory"
                   );

      /* process leaves barrier now */
   }

} /* sbp_barrier */

/* =============== perfectly synchronizing barrier =============== */

/* this mechanism is based on J. Keller's synchronization */

/*
 * sbp_sync_barrier_t
 */

/* state of a perfectly synchronizing barrier */
typedef struct
{
   int count1; /* arrival counter of the first phase  */
   int count2; /* arrival counter of the second phase */

} sbp_sync_barrier_t;

/*
 * sbp_sync_barrier_init()
 */

/*
 * Clear both phase counters of a perfectly synchronizing barrier.
 */
EXTERN_INLINE void sbp_sync_barrier_init(sbp_sync_barrier_t *sb)
{
   sb->count1 = sb->count2 = 0;
}

/*
 * sbp_sync_barrier()
 */

/*
 * Barrier that additionally brings the "nprocs" processes into step:
 *
 * phase 1: every process increments count1 and polls it until it has
 *          reached nprocs
 * phase 2: every process increments count2 and reads it once; a process
 *          that does not yet see nprocs ("fast proc") executes two nops,
 *          a process that already sees nprocs ("slow proc") branches
 *          past them
 * finally every process stores 0 into both counters.
 *
 * The instruction sequence is timing-critical; do not reorder it.
 */
EXTERN_INLINE void sbp_sync_barrier(sbp_sync_barrier_t *sb, int nprocs)
{

   int dummy; /* to make it work correctly on compiler-optimizations */

   asm volatile("bmc 0              \n"
                "\tsyncadd %3, %1, 0\n" /* increment count1          */
                "\tldg %1, 0, %0    \n" /* dummy = count1            */
                "\tnop              \n" /* delay slot                */
                "\tsub %0, %4, r0   \n" /* test count1 == nprocs     */
                "\tbne -3           \n" /* count1 != nprocs ==> loop */
                "\tsyncadd %3, %2, 0\n" /* increment count2          */
                "\tldg %2, 0, %0    \n" /* dummy = count2            */
                "\tnop              \n" /* delay slot                */
                "\tsub %0, %4, r0   \n" /* test count2 == nprocs     */
                "\tbeq 3            \n" /* slow proc ==> continue    */
                "\tnop              \n" /* fast proc ==> 2 nops      */
                "\tnop              \n"

                /* now the "nprocs" processes are synchronous */

                "\tstg %5, %1, 0    \n" /* reset count1 */
                "\tstg %5, %2, 0    \n" /* reset count2 */

                : "=&r" (dummy)                   /* %0 */
                : "r"   (&(sb->count1)),          /* %1 */
                  "r"   (&(sb->count2)),          /* %2 */
                  "r"   (1),                      /* %3 - increment */
                  "r"   (nprocs),                 /* %4 */
                  "r"   (0)                       /* %5 - reset value */
                : "cc", "memory"
               );

} /* sbp_sync_barrier */

/* ############### parallel loop ############### */

/*
 * sbp_parloop_t
 */

/* state of a parallel loop */
typedef struct
{
   int index;          /* next loop index to hand out; advanced by "stride"
                          on every call of sbp_parloop() */
   sbp_barrier_t sync; /* barrier used at the end of the loop */
} sbp_parloop_t;

/*
 * sbp_parloop_init()
 */

/*
 * Prepare a parallel loop: the end-of-loop barrier is reset and the
 * first index handed out will be 0.
 */
EXTERN_INLINE void sbp_parloop_init(sbp_parloop_t *pl)
{
   sbp_barrier_init(&pl->sync);
   pl->index = 0;
}
 
/*
 * sbp_parloop()
 */

void sbp_counter_set(void *counter_and_value);
	/* this function will be called during synchronization to reset the
	   loop index; the callers in this file pass an array of two
	   unsigned ints: [0] = address of the counter, [1] = new value */

/*
 * Fetch the next index of a parallel loop.
 *
 * pl     : loop state, initialized with sbp_parloop_init()
 * max    : largest valid index
 * nprocs : number of processes that work on the loop
 * stride : amount by which the shared index is advanced per call
 *
 * returns the index claimed by the caller, or -1 once the claimed index
 * is larger than "max".  In the latter case the caller first waits in the
 * loop's barrier for the other processes; during that barrier
 * sbp_counter_set() is invoked with the address of pl->index and the
 * value 0, which prepares the loop for its next use.
 */
EXTERN_INLINE int sbp_parloop(sbp_parloop_t *pl, int max,
                                                  int nprocs, int stride)
{
   unsigned int reset_req[2]; /* { address of counter, new value } */
   int claimed = sbp_mpadd_m1(&(pl->index), stride);

   if(claimed <= max)
   {
      return claimed; /* still inside the loop range */
   }

   /* loop end: synchronize and have the index reset to 0 */

   reset_req[0] = (unsigned int)(&(pl->index));
   reset_req[1] = (unsigned int)0;

   sbp_barrier(&(pl->sync), nprocs, sbp_counter_set, (void *) reset_req);

   return -1;

} /* sbp_parloop */

/* ############# calculation of a unique id in a process group  ############ */

/*
 * Calculate an id for the calling process within its process group.
 *
 * procs_in_group : number of processes in the group; all of them must
 *                  call this function
 * counter        : counter shared by the group
 * barrier        : barrier shared by the group
 *
 * A first barrier has "*counter" reset to 0 (via sbp_counter_set), then
 * every process draws its id by adding 1 to the counter, and a second
 * barrier makes sure that all ids have been drawn before anyone returns.
 *
 * returns the value of the counter before the caller's increment
 */
EXTERN_INLINE unsigned int sbp_get_id_in_group(unsigned int procs_in_group,
											   unsigned int *counter,
											   sbp_barrier_t *barrier)
{
	unsigned int reset_req[2]; /* { address of counter, new value } */
	unsigned int my_id;

	/* step 1: reset counter - just to be sure */
	reset_req[0] = (unsigned int)counter;
	reset_req[1] = 0U;
	sbp_barrier(barrier, procs_in_group, sbp_counter_set, (void *)reset_req);

	/* step 2: draw id */
	my_id = sbp_mpadd(counter, 1);

	/* step 3: synchronize processes */
	sbp_barrier(barrier, procs_in_group, NULL, NULL);

	return my_id;

} /* sbp_get_id_in_group() */


/* ############ parallel queue/broadcast queue, parallel stack ############ */

/*
 * log2_floor(value)
 *
 * GNU statement expression that applies the machine instruction "rm" to
 * "value" and yields the result in the type of "value".
 * Judging by the macro name the result is floor(log2(value)) - TODO
 * confirm the semantics of "rm" (especially for value == 0) against the
 * instruction set manual.
 */
#define log2_floor(value)\
({\
   __typeof__(value) ___rvalue___;\
   asm volatile("rm \t%1, %0\n"\
		: "=&r" (___rvalue___)\
		: "r"   (value)\
		: "cc"\
	      );\
   ___rvalue___;\
})

/*
 * maximum queue size
 *
 */

/* base-2 logarithm of the maximum number of lists per structure
   (parallel queue / broadcast queue / stack) */
#define SBP_MAX_PQ_LOG 13 /* there are 2^SBP_MAX_PQ_LOG lists */ 
#define SBP_MAX_BQ_LOG 13 /* there are 2^SBP_MAX_BQ_LOG lists */ 
#define SBP_MAX_ST_LOG 13 /* there are 2^SBP_MAX_ST_LOG lists */ 

/*
 * dummy queue/stack element
 *
 */

/* the definition of "sbp_pq_ele", "sbp_bq_ele" and "sbp_st_ele" must be
   identical (modulo different names for the struct items) */

/* forward declarations: the element types point back to their list head */
struct __pq_listhd__; /* defined below */
struct __bq_listhd__; /* defined below */
struct __st_listhd__; /* defined below */

typedef struct __sbp_pq_ele__
{
   /* the first four components must correspond to the first four components
      of "sbp_pq_listhd" */

   struct __pq_listhd__  *head;    /* pointer to head of list
									  stg/ldg on MODULO 0/1 */
   struct __sbp_pq_ele__ *next;    /* to build a doubly linked list */
   struct __sbp_pq_ele__ *prev;    /* to build a doubly linked list */
   unsigned int           missing; /* number of removed items at the right
									  of the element */
   unsigned int    dummy1, dummy2; /* only used in "sbp_bq_ele" */
} sbp_pq_ele;

/* header of an element stored in a broadcast queue */
typedef struct __sbp_bq_ele__
{
   /* the first three components must correspond to the first three components
      of "sbp_bq_listhd" */

   struct __bq_listhd__  *head;       /* pointer to head of list
                                         stg/ldg on MODULO 0/1 */
   struct __sbp_bq_ele__ *next;       /* to build a doubly linked list */
   struct __sbp_bq_ele__ *prev;       /* to build a doubly linked list */
   unsigned int	          id;         /* identifier of the stored element */
   unsigned int           read_start; /* decremented when a process starts
                                         to read the element
                                         access only by mpadd */
   unsigned int           read_end;   /* decremented when the process has
                                         finished copying the element
                                         syncadd_m0/ldg_m1 */
} sbp_bq_ele;

/* header of an element stored in a parallel stack */
typedef struct __sbp_st_ele__
{
   /* the first three components must correspond to the first three components
      of "sbp_st_listhd" */

   struct __st_listhd__  *head;    /* pointer to head of list
                                      stg/ldg on MODULO 0/1 */
   struct __sbp_st_ele__ *next;    /* to build a linked list */
   struct __sbp_st_ele__ *dummy1;  /* only used in "sbp_pq/bq_ele" */
   unsigned int           dummy2;  /* only used in "sbp_pq/bq_ele" */
   unsigned int   dummy3, dummy4;  /* only used in "sbp_bq_ele" */

} sbp_st_ele;

/*
 * SBP_PQ_TYPEDEF/SBP_BQ_TYPEDEF/SBP_ST_TYPEDEF
 *
 * - the first six items correspond to the six items
 *   of "sbp_pq_ele"/"sbp_bq_ele"/"sbp_st_ele"
 *
 * - the three macros are interchangeable
 *
 */

/*
 * SBP_PQ_TYPEDEF(typename, items...)
 *
 * Defines "typedef struct typename {...} typename;" for a user element
 * type of a parallel queue: six header fields laid out like "sbp_pq_ele",
 * followed by the user-supplied member declarations "items".
 * Uses the GNU named variadic macro syntax ("items...").
 */
#define SBP_PQ_TYPEDEF(typename, items...)\
\
typedef struct typename\
{void *___head___, *___next___, *___prev___;\
 unsigned int ___missing___, ___dummy1___, ___dummy2___;\
 items\
} typename;

/*
 * SBP_BQ_TYPEDEF(typename, items...)
 *
 * Defines "typedef struct typename {...} typename;" for a user element
 * type of a broadcast queue: six header fields laid out like "sbp_bq_ele",
 * followed by the user-supplied member declarations "items".
 * Uses the GNU named variadic macro syntax ("items...").
 */
#define SBP_BQ_TYPEDEF(typename, items...)\
\
typedef struct typename\
{void *___head___, *___next___, *___prev___;\
 unsigned int ___id___, ___read_start___, ___read_end___;\
 items\
} typename;

/*
 * SBP_ST_TYPEDEF(typename, items...)
 *
 * Defines "typedef struct typename {...} typename;" for a user element
 * type of a parallel stack: six header fields laid out like "sbp_st_ele",
 * followed by the user-supplied member declarations "items".
 * Uses the GNU named variadic macro syntax ("items...").
 */
#define SBP_ST_TYPEDEF(typename, items...)\
\
typedef struct typename\
{void *___head___, *___next___, *___dummy1___;\
 unsigned int ___dummy2___, ___dummy3___, ___dummy4___;\
 items\
} typename;

/*
 * head of a list of queued/stacked elements
 *
 */

struct __sbp_parallel_queue__; /* defined below */

/* head of one list of elements of a parallel queue */
typedef struct __pq_listhd__
{
	/* the first four components must correspond to the first four components
	   of "sbp_pq_ele" */

	struct __sbp_parallel_queue__ *queue; /* queue this list belongs to;
	                                         only read access */
	sbp_pq_ele	    *first;  /* to build a doubly linked list
	                            access protected by lock */
	sbp_pq_ele	    *last;   /* to build a doubly linked list
	                            access protected by lock */
	int		     missing;    /* number of removed items at the head of
	                            the list;
	                            access protected by lock */
	sbp_fair_lock_t   list_lock;   /* protects first, last and missing */
	unsigned int      read_count;  /* syncadd_m0/ldg_m1 */
	unsigned int      write_count; /* syncadd_m0/ldg_m1 */

} sbp_pq_listhd;


struct __sbp_broadcast_queue__; /* defined below */

/* head of one list of elements of a broadcast queue */
typedef struct __bq_listhd__
{
	/* the first three components must correspond to the first three components
	   of "sbp_bq_ele" */

	struct __sbp_broadcast_queue__ *queue; /* queue this list belongs to;
	                                          only read access */
	sbp_bq_ele	    *first;                /* to build a doubly linked list
	                                          normally not written and read
	                                          at the same time */
	sbp_bq_ele	    *last;                 /* to build a doubly linked list
	                                          normally not written and read
	                                          at the same time */
	sbp_rw_lock_t    list_lock;            /* reader/writer lock of the list */
	unsigned int     write_count;          /* syncadd_m0/ldg_m1 */

} sbp_bq_listhd;


struct __sbp_stack__; /* defined below */

/* head of one list of elements of a parallel stack */
typedef struct __st_listhd__
{
	/* the first three components must correspond to the first three components
	   of "sbp_st_ele" */

	struct __sbp_stack__ *queue;  /* stack this list belongs to;
	                                 only read access */
	sbp_st_ele	         *first;  /* to build a linked list
	                                 normally not written and read
	                                 at the same time */
	sbp_st_ele	         *dummy;  /* not used */
	unsigned int     round_count; /* counts rounds for this list within
	                                 a push- or pop-phase
	                                 syncadd_m0/ldg_m1 */

} sbp_st_listhd;


/*
 * parallel queue/broadcast queue/parallel stack
 *
 */

/* parallel fifo queue */
typedef struct __sbp_parallel_queue__
{
	unsigned int q_size;       /* only read access */
	int min_num_ele;           /* value reported by sbp_pq_noe();
	                              tdr_m1 & mpadd_m1/syncadd_m0 */
	unsigned int read_count;   /* only access by mpadd */
	unsigned int write_count;  /* only access by mpadd */
	sbp_pq_listhd *listarray;  /* array of list heads; only read access */

} sbp_pq;


/* parallel broadcast queue */
typedef struct __sbp_broadcast_queue__
{
	unsigned int q_size;        /* only read access */
	unsigned int num_procs;     /* number of processors that use this queue;
	                               only read access */
	int min_num_ele;            /* syncadd_m0/ldg_m1 */
	unsigned int write_count;   /* only access by mpadd */
	unsigned int *read_count;   /* holds the id of the next element
	                               the process wants to read;
	                               there's one counter for each processor */
	sbp_bq_listhd *list_array;  /* array of lists of queued elements
	                               only read access */

} sbp_bq;


/* parallel stack */
typedef struct __sbp_stack__
{
	unsigned int s_size;        /* only read access */
	unsigned int s_pointer;     /* stack pointer - only accessed via mpadd */
	int min_num_ele;            /* minimal number of stored elements
	                               tdr_m1 & mpadd_m1/syncadd_m0 */
    sbp_gr_fair_lock_t pp_lock;/* lock for separating push- and pop-phases */
	sbp_st_listhd *listarray;   /* array of list heads; only read access */

} sbp_st;


/*
 * function prototypes
 *
 */

/* parallel copy routine (implemented outside this header).
   NOTE(review): judging by the name and the parameters, "copy_len" units
   from "source_adr" are copied to each of the "num_dest" addresses in
   "destination_adr" by "num_procs" cooperating processes that share
   "counter" and "barrier" - confirm against the implementation */
void sbp_par_copy_1_n(unsigned int *source_adr, unsigned int copy_len,       
                      unsigned int **destination_adr, unsigned int num_dest, 
                      unsigned int num_procs,                                
                      shared unsigned int *counter,                          
                      shared sbp_barrier_t *barrier);

/* parallel fifo queue; all functions except sbp_pq_noe() (inline, below)
   are implemented outside this header.
   The element pointers are passed as "void *" so that any type created
   with SBP_PQ_TYPEDEF can be used; "ele_pt_pt" is presumably the address
   of an element pointer that receives the result - TODO confirm */
int sbp_pq_init(sbp_pq *pq, unsigned int queuesize);
int sbp_pq_par_init(sbp_pq *pq, unsigned int queuesize,
					sbp_barrier_t *barrier, unsigned int num_init_procs);
int sbp_pq_noe(sbp_pq *pq);
int sbp_pq_read(sbp_pq *pq, void *ele_pt_pt);
void sbp_pq_write(sbp_pq *pq, void *ele_pt);
int sbp_pq_remove(sbp_pq *pq, void *ele_pt);

/* parallel broadcast queue; all functions except sbp_bq_noe() (inline,
   below) are implemented outside this header.
   "id_in_group" selects the caller's private read counter
   (sbp_bq.read_count[id_in_group]) */
int sbp_bq_init(sbp_bq *bq, unsigned int queuesize,
				unsigned int procs_in_group);
int sbp_bq_par_init(sbp_bq *bq, unsigned int queuesize,
				unsigned int procs_in_group,
				sbp_barrier_t *barrier, unsigned int num_init_procs);
int sbp_bq_noe(sbp_bq *bq, unsigned int id_in_group);
int sbp_bq_read(sbp_bq *bq, void *ele_pt_pt, void *(*ele_copy)(void *),
				unsigned int id_in_group);
void sbp_bq_write(sbp_bq *bq, void *ele_pt);

/* parallel stack; all functions except sbp_st_noe() (inline, below) are
   implemented outside this header */
int sbp_st_init(sbp_st *st, unsigned int stacksize);
int sbp_st_par_init(sbp_st *st, unsigned int stacksize,
					sbp_barrier_t *barrier, unsigned int num_init_procs);
int sbp_st_noe(sbp_st *st);
int sbp_st_pop(sbp_st *st, void *ele_pt_pt);
void sbp_st_push(sbp_st *st, void *ele_pt);


/* inline functions */

/*
 * Number of elements of a parallel queue: returns the current value of
 * pq->min_num_ele, read with a multiprefix add of 0 (the counter is not
 * changed).
 */
EXTERN_INLINE int sbp_pq_noe(sbp_pq *pq)
{
	const int stored = sbp_mpadd_m1(&(pq->min_num_ele), 0);

	return stored;

} /* sbp_pq_noe() */


/*
 * Number of elements of a broadcast queue that the process with id
 * "id_in_group" has not read yet: current value of bq->min_num_ele
 * (read with a multiprefix add of 0) minus the caller's private read
 * counter.
 */
EXTERN_INLINE int sbp_bq_noe(sbp_bq *bq, unsigned int id_in_group)
{
	const unsigned int already_read = bq->read_count[id_in_group];

	return sbp_mpadd_m1(&(bq->min_num_ele), 0) - already_read;

} /* sbp_bq_noe() */


/*
 * Number of elements of a parallel stack: returns the current value of
 * st->min_num_ele, read with a multiprefix add of 0 (the counter is not
 * changed).
 */
EXTERN_INLINE int sbp_st_noe(sbp_st *st)
{
	const int stored = sbp_mpadd_m1(&(st->min_num_ele), 0);

	return stored;

} /* sbp_st_noe() */

#endif /* ifdef(_SBP_MULTIPROC_H_) */

