/* ForkLight Compiler (C) 1998 C.W. Kessler
 *
 * Target-independent implementations of PCODE and service routines
 * and
 * Target-machine dependent library based on P4
 * (shared memory access, startup, etc.)
 * to be activated by compiler option -DP4
 * Hardware-specific implementations of shared memory access,
 * shmalloc(), optimized spawning, atomic primitives 
 * (fetch_op(), atomic_op()) to be activated by indicating the
 * corresponding hardware, e.g. -DSBPRAM
 */

#ifdef SBPRAM
#ifndef P4
#define P4
#endif
#endif

#ifdef P4
#include "p4.h"
#endif
#include "libfl.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define TRACEBARRIER 0
#define BARRIEROUTOFTIME 6000000  // about 10 seconds
// ^^^ only for test purposes. For longer program runs
// this value must be set to infinity. 
#define TRACEFRAMES 0
#define TRACESHREADS 0
#define TRACESHWRITES 0
#define TRACEATOMICACCESS 0
#define TRACEPRMEMACCESS 0
#define TRACEPCODE 0
#define ALONEOPTISTURNEDON 0   // vorsicht bei _barrier(_gpp[0]...)

unsigned int __P__ = 1;    /* total number of threads (static) */
                           /* access via PhysThreadId() only */
int _alone = 0;     // counter: >=1 if the group has only this one proc.
                    // # ignored frames = _alone - 1
int _rootgroup = 0; // remains 1 as long as the initial group has
                    // not been split up into more than one subgroup
                    // kracht beim privaten if!
int _NROFSUBGROUPS; /* transport of information */
int _SLICESIZE;     /* in words */

static union { int i; float f; void *vp;} _converter;
void *_itovp( int x ) { _converter.i = (x>>2)<<2; return _converter.vp; }
void *_ftovp( float x ) { _converter.f = x; return _converter.vp; }

/* the stack pointers: */
void **_gpp, **_gps, **_eps, **_sps; 

static void * _pstack[_PSTACKSIZE];
void **_spp = _pstack;

       void **_sstack;
       void **_gsstack;
static void **_scratch;
void *_temp[_TEMPSTACKSIZE];

int _e;
#define eprintf printf("P%d:",_PhysThreadId());\
                for(_e=0;_e<_PhysThreadId();_e++) printf("         ");\
                printf
#define efprintf printf("P%d:",_PhysThreadId());\
                for(_e=0;_e<_PhysThreadId();_e++) printf("         ");\
                fprintf

void _FatalError( char *str )
{
 efprintf( stderr, "### FATAL ERROR: \n" );
 efprintf( stderr, str );
 _endparallelsection( 0 );
 exit(1);
}


/* handling the private stack 
 * (mainly procedure parameters and private group frames): */

void _push( void *x, size_t sz )
{
#if TRACEPRMEMACCESS
 eprintf(" _push(%d,%d), _spp=%d\n",(int)x,sz,(int)_spp);
 if (sz<=0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", (int)sz);
     _FatalError(sbuf);
 }
#endif
 if (_spp+sz >= _pstack+(_PSTACKSIZE/4))
     _FatalError("private stack overflow!\n");
 if (sz <= _VOIDPTYPESIZE) {  /* one word */
   *_spp++ = x;
 }
 else {  /* double etc. */
     memcpy( (void *)_spp,   x,  sz );
       /*    target src size   */
     _spp += (sz+3)/_VOIDPTYPESIZE;
   }
}


void _pop( void **retptr, size_t sz )
{
#if TRACEPRMEMACCESS
 eprintf(" _pop(%x,%d), _spp=%d\n",(int)retptr,sz,(int)_spp);
#endif
  if (_spp-sz < _pstack) _FatalError("private stack underflow!\n");
  if (sz<=_VOIDPTYPESIZE) 
     *retptr = *_spp--;
  else /*(sz>4)*/ {
     _spp -= (sz-3)/_VOIDPTYPESIZE;
     memcpy( (void *)retptr, _spp, sz );
  }
}


void _top( void **retptr, size_t sz )
{
#if TRACEPRMEMACCESS
 eprintf(" _top(%x,%d)\n", (int)retptr, sz );
 if (sz<=0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", sz);
     _FatalError(sbuf);
 }
#endif
 if (sz<=_VOIDPTYPESIZE) 
     *retptr = *_spp;
 else
     memcpy( retptr, (void *)_spp, sz );
}


/* handling the shared stack 
 * (group frames, synchronization, shared local variables): */

void _SMpush( void *x, size_t sz )
{
#if TRACESHWRITES
 eprintf(" _SMpush(%d,%d) -> ",(int)x,sz);
 if (sz <= 0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", (int)sz);
     _FatalError(sbuf);
 }
#endif
 if (_sps+sz >= _eps)
    _FatalError("shared stack overflow!\n");
 _SMwrite( _sps, x, _VOIDPTYPESIZE );
 _sps+= (sz+(_VOIDPTYPESIZE-1))/_VOIDPTYPESIZE;
             /* round upwards. SUN_SOLARIS: Char=1, others: multiples of 4 */
}


void _SMpop( void **retptr, size_t sz )
{
#if TRACESHREADS
 eprintf(" _SMpop(%x,%d) -> ",(int)retptr,sz);
  if (sz <= 0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", sz);
     _FatalError(sbuf);
  }
#endif
  if (_sps-sz < _sstack) _FatalError("shared stack underflow!\n");
  _sps -= (sz+(_VOIDPTYPESIZE-1))/_VOIDPTYPESIZE;
  _SMread( retptr, _sps, sz );
}


void _SMtop( void **retptr, size_t sz )
{
#if TRACESHREADS
 eprintf(" _SMtop(%x,%d) -> ",(int)retptr,sz);
  if (sz<=0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", sz);
     _FatalError(sbuf);
  }
#endif
  _SMread( retptr, _sps, sz );
}


#define _shgrframesize 4 /*words*/
#define _makeshframe( firstSM, pinit ) \
{ \
  _gps = firstSM;\
  _SMwrite( firstSM, pinit, _VOIDPTYPESIZE );\
  _SMwrite( firstSM + 1, 0, _VOIDPTYPESIZE );\
  _SMwrite( firstSM + 2, 0, _VOIDPTYPESIZE );\
  _SMwrite( firstSM + 3, pinit, _VOIDPTYPESIZE );\
  _sps = firstSM + _shgrframesize; \
}

void printgroupframes( void )
{
 eprintf("shar.grframe: _gps=%x ->( %d, %d, %d, %d ) csc=%d alone=%d\n", _gps, 
        _gps[0], _gps[1], _gps[2], _gps[3], _gpp[5], _alone );
 eprintf("priv.grframe: _gpp=%x ->( %x %x %x %x %d csc=%d @=%d $=%d )\n",
        _gpp, _gpp[0], _gpp[1], _gpp[2], _gpp[3], _gpp[4], _gpp[5], _gpp[6], _gpp[7] );
}
 

void _barrier( void **thegpp, void **thegps )
{
  int csc, next;
  volatile int numpr;
  int counter = 0;
  if (_alone) {
#if TRACEBARRIER || TRACEPCODE
     eprintf("skipped _barrier( thegpp=%x, thegps=%x ), alone=%d\n",
             thegpp, thegps, _alone);
#endif
     return;
  }
  csc = (int) thegpp[5];
#if TRACEBARRIER || TRACEPCODE
  eprintf("_barrier( thegpp=%x, thegps=%x, csc=%d )\n", thegpp, thegps, csc);
#endif
  next = csc + 1;
  if (next > 2) next = 0;
  atomic_add( thegps + next, 1 );
  atomic_add( thegps + csc, -1 );
  do {      /*busy wait loop*/
     _SMread( (void **)&numpr, thegps+csc, _VOIDPTYPESIZE );
#if TRACEBARRIER
     if (!counter % 10000)
        {eprintf("waiting in barrier...%d\n", numpr);}
#endif
     counter++;
     if (counter > BARRIEROUTOFTIME)
       _FatalError("  _barrier: time overflow!\n");
  } while (numpr != 0);
  ((int *)thegpp)[5] = next; 
#if TRACEBARRIER
  eprintf(" leaving _barrier, new csc=%d\n", thegpp[5] );
#endif
}


inline void barrier( void ) { _barrier( _gpp, _gps ); }


extern int groupsize( void )  /*inspect sync cell*/
{
  void *ret;
  _SMread( &ret, _gps+3, _VOIDPTYPESIZE);
#if TRACEPCODE
  eprintf("groupsize() -> %d\n", (int)ret);
#endif
  return (int) ret;
}


inline int _shavail( void ) 
{
#if TRACEPCODE
  eprintf("_shavail() ->%x - %x = %d\n", _eps, _sps, (int)(_eps - _sps) * _VOIDPTYPESIZE);
#endif
  return (int)(_eps - _sps) * _VOIDPTYPESIZE; 
}

void *shalloc( size_t k )
{
  _eps = _eps - (k+(_VOIDPTYPESIZE-1))/_VOIDPTYPESIZE;
#if TRACEPCODE
  eprintf("shalloc( %d ) -> new eps = %x\n", k, _eps);
#endif
  if (_eps <= _sps)
     _FatalError("shalloc: Automatic Shared Heap Overflow!\n");
  return (void *) _eps;
}


void _makegroupframe( int numberofgroups )
/* allocate a private group frame on the private stack;
 * initialize the new subgroup's @ and $ to their previous values,
 * so they can be used as operands in the fork expressions
 */
{
  int slicesize = _eps - _sps;
  int oldgid, oldpid;
#if TRACEPCODE
  eprintf("_makegroupframe(%d); available: %d words. _alone=%d\n",
          numberofgroups, slicesize, _alone );
#endif
  if (numberofgroups > 1)  _rootgroup = 0;  // ab jetzt keine mehr
  // das kann noch optimiert werden, falls die Rootgroup im
  // weiteren Verlauf nochmal aktiv und wieder aufgespalten wird.
  assert( _gpp );
  if (_alone) {   // Sparframe: nur @ retten, $ bleibt 0 da _alone
                  // ignoredframes will be incremented in _entergroup()
     _push( _gpp[6], _VOIDPTYPESIZE );
     return;      // frames will be ignored.
  }
  oldgid = ((int *)_gpp)[6];
  oldpid = ((int *)_gpp)[7];
  if (numberofgroups < 1)
     _FatalError("fork(): Number of groups is non-positive!\n");
  slicesize = slicesize / numberofgroups;
  if (slicesize < _MINSSTACKSIZE) 
     _FatalError("Shared Stack Overflow!\n");
  _push( _gpp, _VOIDPTYPESIZE );
  _gpp = _spp - 1;
  _push( _sps, _VOIDPTYPESIZE );
  _push( _eps, _VOIDPTYPESIZE );
  _push( _gps, _VOIDPTYPESIZE );
  _push( 0, _VOIDPTYPESIZE );    /*vormals ignoredframes*/
  _push( 0, _VOIDPTYPESIZE );    /*csc*/
  _push( (void *)oldgid, _VOIDPTYPESIZE );    /*copy @*/
  _push( (void *)oldpid, _VOIDPTYPESIZE );    /*copy $*/
  _SLICESIZE = slicesize;   /*will be used in _entergroup()*/
  _NROFSUBGROUPS = numberofgroups;   /*will be used in _entergroup()*/
}

#define _prgrframesize 8 /*words*/


void _entergroup( int klammeraffe )
 /* become member of subgroup ^^
  * precondition: makegroupframes has been executed before 
  * precondition: klammeraffe is in the range [0..numberofgroups-1]
  *               (guaranteed by ENGRPV in gen.c)
  * executes _entergroup for the same subgroup.
  */
{
  /* csc==0*/
  void **firstSM;
  int myrank;
  int mysize;
#if TRACEPCODE
  eprintf("_entergroup(%d), _alone=%d, _rootgroup=%d\n",
           klammeraffe, _alone, _rootgroup );
#endif
  if (_alone) {  /* no shared frame needs be allocated */
     _alone ++;  /*incr. #ignored frames */
     return;
  }
  _gpp[6] = (void *) klammeraffe;
  firstSM = &(_sps[_SLICESIZE * klammeraffe]);
#if TRACEPCODE
  eprintf(" _SLICESIZE:%d, firstSM:%x\n", _SLICESIZE, firstSM);
#endif
  // Case 1: _rootgroup  -> using scratch array saves one barrier!
  if (_rootgroup) {
    assert(0);
    myrank = fetch_add( &(_scratch[klammeraffe]), 1 );
    _gpp[7] = (void *) myrank;
    /* initialize sync cells for new subgroups: */
    /* spaeter kann hier optimiert werden... */
    if (myrank==0) { // group leader
#if TRACEPCODE
        eprintf("I am leader, build sh grframe\n");
#endif
        _makeshframe( firstSM, 0 )    /*setzt auch _sps, _gps*/
    }
    else {
        _gps = firstSM;
        _sps = firstSM + _shgrframesize;
    }
    _barrier(_gpp[0], _gpp[3]);  /* first barrier on parent group */
    // now the new frame exists,
    // and scratch[0..g-1] contains the final subgroup sizes 
    _SMread( (void **)&mysize, &(_scratch[klammeraffe]), _VOIDPTYPESIZE );
    if (myrank==0) { // single SMwrites cheaper than another fetch_add
       _SMwrite( &(firstSM[0]), (void *)mysize, _VOIDPTYPESIZE );  // set group size cell
       _SMwrite( &(firstSM[3]), (void *)mysize, _VOIDPTYPESIZE );  // set group size cell
       _SMwrite( &(_scratch[klammeraffe]), 0, _VOIDPTYPESIZE );  // reset scratch cell
    }
    _barrier(_gpp[0],_gpp[3]);  /* second barrier on parent group */
    /* now firstSM[0]..[3] is valid */
    _eps = &(firstSM[_SLICESIZE - 1]);
  }
  // -- Case 2: -------------------------------------------
  else {  // not _rootgroup: the group has been truly split
          // -> scratch array cannot be safely reused in parallel
    _SMwrite( firstSM, 0, _VOIDPTYPESIZE );    // init sync cell
    // ^ multiple write laesst sich leider hier nicht vermeiden.
    _barrier(_gpp[0], _gpp[3]);  /* first barrier on parent group */
    // now the first sync cell is guaranteed to be 0
    myrank = fetch_add( firstSM, 1 );
    _gpp[7] = (void *) myrank;
    /* initialize sync cells for new subgroups: */
    /* spaeter kann hier optimiert werden... */
    if (myrank==0) { // group leader
#if TRACEPCODE
       eprintf("I am leader, build sh grframe (Case 2)\n");
#endif
       _gps = firstSM;
       _SMwrite( &(firstSM[1]), 0, _VOIDPTYPESIZE );
       _SMwrite( &(firstSM[2]), 0, _VOIDPTYPESIZE );
       _sps = firstSM + _shgrframesize;
    }
    else {
       _gps = firstSM;
       _sps = firstSM + _shgrframesize;
    }
    _barrier(_gpp[0], _gpp[3]);  /* second barrier on parent group */
    // now the new frame exists,
    // and _gps[0] contains the final subgroup sizes 
    _SMread( (void **)&mysize, &(firstSM[0]), _VOIDPTYPESIZE );
    if (myrank==0) { // single SMread/writes cheaper than another fetch_add
       _SMwrite( &(firstSM[3]), (void *)mysize, _VOIDPTYPESIZE );// set group size cell
    }
    _barrier(_gpp[0],_gpp[3]);  /* third barrier on parent group */
    // now firstSM[0]..[3] is valid
    _eps = &(firstSM[_SLICESIZE - 1]);
  } // end else Case 2: not _rootgroup
#if ALONEOPTISTURNEDON
  if (mysize==1) _alone = 1;
    // das kann ja nur passieren, falls _alone vorher 0 war.
#endif
#if TRACEFRAMES
  printgroupframes();
#endif
}


void _leavegroup( void )  /* cancel membership of the current group */
{
 if (_alone > 1 ) {  // # ignored frames > 0
#if TRACEPCODE
    eprintf("_leavegroup(), decr. _alone %d -> %d\n", _alone, _alone-1 );
#endif
    _alone --; if (_alone==1) _alone = 0;
    assert( _spp > _gpp + _prgrframesize );
    _pop( &(_gpp[6]), _VOIDPTYPESIZE );   // restore old @ from Sparframe
 }
 else {
#if TRACEPCODE
    eprintf("_leavegroup(), remove frame\n");
#endif
  atomic_add( &(_gps[3]), -1 );   /*decr. groupsize*/
  atomic_add( &(_gps[(int)_gpp[5]]), -1 ); /*decr. current sync cell*/
  _gps = _gpp[3];
  _eps = _gpp[2];
  _sps = _gpp[1];
  _spp = _gpp; 
  _gpp = _gpp[0];
  _alone = 0;
 }
#if TRACEFRAMES
 printgroupframes();
#endif
}

void _leaveprivategroup( void )  /* strip off the private group frame*/
{
 if (_alone > 1 ) {  // # ignored frames > 0
#if TRACEPCODE
        eprintf("_leavepriategroup(), decr. _alone %d -> %d\n", _alone, _alone-1);
#endif
        _alone --; if (_alone==1) _alone = 0;
        assert( _spp > _gpp + _prgrframesize );
        _pop( &(_gpp[6]), _VOIDPTYPESIZE );   // restore old @ from Sparframe
 }
 else {
#if TRACEPCODE
  eprintf("_leaveprivategroup(), remove private frame\n");
#endif
  _spp = _gpp; 
  _gpp = _gpp[0];
 }
}

/* fuer join() benoetigt: */

void **__SM;    /* contains ptrs to shmemory bus sections */
int *__gone;    /* guards access to _ticket */
int *__ticket;  /* semaphore to control entry of passengers */


void _allocsharedglobals( size_t k )
{
  /* at this point of time, the parallel procs have just been generated,
   * so all will execute this before using any global variable.
   */
#if TRACEPCODE
  eprintf("_allocsharedglobals( %d )\n", k );
#endif
  if ( k >= _shavail() )
     _FatalError("Not enough shared heap space for the shared globals!\n");
  _gsstack = (void **)shalloc( k );
}


void _InitSharedStack( unsigned int numprocs )
{
 /*precondition: _sstack has been allocated*/
 int i; 
 _eps = &(_sstack[_SSTACKSIZE/_VOIDPTYPESIZE]);    /* group-local shared heap pointer */

/* was: void _init_joins( void )  */
/* to be called by all processors, as early as possible. */
 __SM = _sstack;
 __gone = (int *)&(_sstack[_MAXJOINS]);
 __ticket = (int *)&(_sstack[2*_MAXJOINS]);
 _sps = &(_sstack[3*_MAXJOINS]);
 for (i=0; i<_MAXJOINS; i++) {
     _SMwrite( (void **)&(__gone[i]), 0, _VOIDPTYPESIZE );
     _SMwrite( (void **)&(__ticket[i]), 0, _VOIDPTYPESIZE );
 }
 if (_PhysThreadId()==0)
    _makeshframe( _sps, (void *) numprocs )
 else {
    _gps = _sps;
    _sps = _sps + _shgrframesize;
 }
}

void _InitPrivateStack( unsigned int numprocs )
{
  _gpp = _pstack;
  /*set up initial private group frame: */
  _push( 0, _VOIDPTYPESIZE );       /* _gpp=0, no previous private group frame */
  _push( 0, _VOIDPTYPESIZE );       /* _sps */
  _push( 0, _VOIDPTYPESIZE );       /* _eps */
  _push( 0, _VOIDPTYPESIZE );       /* _gps */
  _push( 0, _VOIDPTYPESIZE );       /* ignoredframes */
  _push( 0, _VOIDPTYPESIZE );       /* csc */
  _push( 0, _VOIDPTYPESIZE );       /* @ := 0 */
  _push( (void *)_PhysThreadId(), _VOIDPTYPESIZE );  /* _gpp[2] = $ := _PhysProcId();*/
}

void _SMdump( void )
{
 void **p;
 eprintf("---------Shared Memory:---------\n");
 for (p=_sps; p>=_sstack; p--) {
    int k;
    _SMread( (void **)&k, p, _VOIDPTYPESIZE );
    eprintf("%d # %d\n", p, k);
 }
 eprintf("--------------------------------\n");
}


void *shmalloc( size_t k ) 
{
// spaeter hier richtige Speicherverwaltung reinmachen;
// muss garantieren, dass Adressen im shmalloc-Block
// von isshared() als shared erkannt werden
   return _SMalloc( k );
}


/*------------------------------------------------------------------*/
/*  HERE FOLLOWS THE SYSTEM-DEPENDENT PART. */
/*------------------------------------------------------------------*/

#ifdef P4
p4_lock_t *pmemlock;
#endif



void _SMread( void **retptr, void *loc, size_t sz )
// Atomic read. System dependent!
{
 assert( _isshared( (void **)loc ) );
#if TRACESHREADS
 eprintf(" _SMread(%x,%x,%d) ->",(int)retptr,(int)loc,sz);
  if (sz <= 0) {
     char sbuf[64];
     sprintf(sbuf, "Size %d not permitted.\n", sz);
     _FatalError(sbuf);
  }
#endif
  if (sz<=_VOIDPTYPESIZE) {
#ifdef SBPRAM
     *retptr = sbp_mpadd_m0( (void **)loc, 0 );
#else
     *retptr = *(void **)loc;
#endif
#if TRACESHREADS
     eprintf("%x\n", *retptr);
#endif
  }
  else {   /*oder explicitely ausprogrammieren mit target sm read*/
#ifdef P4
     p4_lock( pmemlock );
#endif
     memcpy( retptr, loc, sz );
#ifdef P4
     p4_unlock( pmemlock );
#endif
#if TRACESHREADS
     eprintf("%x\n", *retptr);
#endif
  }
}


extern void _SMwrite(void **loc, void *val, size_t sz)
// Atomic write. System dependent!
// val is a value if sz<=4 and a pointer to the value if sz > 4
{
 assert( _isshared( loc ) );
#if TRACESHWRITES
 eprintf(" _SMwrite(%x,%x,%d)\n",(int)loc, (int)val, sz);
 if (sz<=0) {
    char sbuf[64];
    sprintf(sbuf, "Size %d not permitted.\n", (int)sz);
    _FatalError(sbuf);
 }
#endif
 if (sz <= _VOIDPTYPESIZE)   /* one word on sequential architecture */
#ifdef SBPRAM
   sbp_stg_m1( loc, val );
#else
   *loc = val;
#endif
 else {  /* ggf. mit Schleife und native sm-write ausprogrammieren */
#ifdef P4
     p4_lock( pmemlock );
#endif
     memcpy( loc,   val,  sz );
#ifdef P4
     p4_unlock( pmemlock );
#endif
       /*    target src size   */
   }
}


int fetch_add( void **loc, int i )
// system dependent!
{
  int ret;
  assert( _isshared( loc ) );
#ifdef SBPRAM
  ret = (int) sbp_mpadd( loc, (void *)i );  
#else
#ifdef P4
  p4_lock( pmemlock );
  _SMread( (void *)&ret, loc, _VOIDPTYPESIZE );
  _SMwrite( loc, (void *)(ret + i), _VOIDPTYPESIZE );
  p4_unlock( pmemlock );
#else
  assert(0);
  *((int *)loc) += i;
#endif
#endif
#if TRACEATOMICACCESS
  eprintf(" fetch_add(%x,%d) -> %d\n", (int)loc, i, ret);
#endif
  return ret;
}
 
void atomic_add( void **loc, int i )
// Atomic increment for 32bit words (int). System dependent!
{  
  assert( _isshared( loc ) );
#if TRACEATOMICACCESS
  eprintf(" atomic_add(%x,%d)\n", (int)loc, i);
#endif
#ifdef SBPRAM
  sbp_mpadd_m1( loc, i );   // ignore return value.
#else
#ifdef P4
  p4_lock( pmemlock );
  *loc = *loc + i;
  p4_unlock( pmemlock );
#else
  *loc = *loc + i;
#endif
#endif
}

int _PhysThreadId( void ) 
// system dependent!
{
#ifdef P4
  return p4_get_my_id();
#else
  return 0;     /*sequential for the first*/
#endif
}

void *_SMalloc( size_t sz )
// system dependent!
{
  void *ret;
#ifdef P4
  ret  = (void *) p4_shmalloc( sz );
#else
  ret  = (void *) malloc( sz );
#endif
#if TRACEPCODE
  eprintf("_SMalloc( %d ) -> %x\n", sz, ret );
#endif
  return ret;
}


void _beginparallelsection( unsigned int numprocs )
// sequential part of initialization
// and call the parallel part and _main()
// system dependent!
{
  int i;
  FILE *pgfp;
 
  setbuf( stdout, NULL );
#if TRACEPCODE
  eprintf("_beginparallelsection( %d )\n", numprocs);
#endif
  __P__ = numprocs;
#ifdef P4
  pmemlock = (p4_lock_t *)p4_shmalloc( sizeof(p4_lock_t));
  p4_lock_init( pmemlock );
#endif
  /* initialize shared memory for group stack/heap: */
  _sstack = (void **)_SMalloc( _SSTACKSIZE );
  _InitSharedStack( numprocs );

  /* now spawn numprocs - 1 threads: */
  /* first write a procgroup file: */
#ifdef P4
  if ((pgfp = fopen("procgroup", "w"))==0)
     _FatalError("Cannot write procgroup file\n");
  fprintf(pgfp, "local %d\n", numprocs - 1);
  fclose(pgfp);
  p4_create_procgroup();
#endif

// ex void *_doparallel_main( void ):
// the parallel part of initialization
// precondition: the shared stack has already been initialized
// {
//   extern void main(void);
//   int i;
  _InitPrivateStack( __P__ );
  _gpp[7] = (void *)p4_get_my_id();
  /* initialize scratch space: */
  _scratch = (void **)shalloc( _MAXNROFSUBGROUPS * sizeof( int ) );
  for (i=p4_get_my_id(); i<_MAXNROFSUBGROUPS; i+= __P__ )
     _scratch[i] = 0;
#if TRACEFRAMES
  printgroupframes();
  eprintf("Initialization finished\n");
  p4_global_barrier( 4 );
#endif
#ifndef P4
 _main();
#endif
// return 0;
// }
}

void _endparallelsection( unsigned int numprocs )
// system dependent!
{
  int i;
  void *status;
#if TRACEPCODE
  eprintf("_endparallelsection( %d )\n", numprocs );
#endif
#ifdef P4
  p4_wait_for_end();
#else
  for (i=1; i<numprocs; i++ ) 
     thr_join( tids[i], NULL, &status );
  for (i=0;i<1000000; i++) ;  /*allow printf-messages to appear*/
#endif
#if TRACEPCODE
  if (numprocs >= 1)
    {eprintf("Only one thread active. Program halts.\n");}
#endif
}

