/* forklib2.asm:
 * Source code for the Fork95 run time functions.
 * (c) 1994 - 1999 Christoph W. Kessler
 *
 * To recompile forklib2.o and forklib.o, 
 * run cpp on forklib2.asm first.
 * forklib.o is the variant without profiling code.
 * forklib2.o is generated with -DPROFILING
 */

/* These routines cannot be called directly by the
 * user, because they receive their parameters
 * in the registers par1,...,par4.
 * They are invoked only via the compiler. */

/* WARNING! The synchronization routine has been optimized
 * for use with the current SBPRAM simulator. When moving to
 * the real hardware, two more nops are required! CWK 960311
 */

#include "forkaliases"
#include "sbo_prozflags.h"

/* Section .gpdata: initialized data that forklib_startup copies (together
 * with .gpbss and .args) into the private memory area of each processor.
 * It holds the processor number, the random seed, the per-processor
 * profiling counters and the join pointers. */
.section ".gpdata", .data
.globl ___PROC_NR__   /* forklib_startup stores the processor number $ here */
___PROC_NR__:.int 0
.globl forklib_randomnumber   /* state of _rand, seeded by _srand */
forklib_randomnumber:.int 1919191            /*default seed*/
.globl __barrcnt     /* counts user barriers of this processor */
__barrcnt:.word 0
.globl __barrWaitTime /* accumulates time spent in barriers by this processor */
__barrWaitTime:.word 0
.globl __myshldgcnt  /* counts user loads of this proc from shared addresses */
__myshldgcnt:.word 0
.globl __myshstgcnt  /* counts user stores of this proc to shared addresses */
__myshstgcnt:.word 0
.globl __mympaddcnt  /* counts user mpadds/syncadds of this proc to shared addresses */
__mympaddcnt:.word 0
.globl __mympmaxcnt  /* counts user mpmaxs/syncmaxs of this proc to shared addresses */
__mympmaxcnt:.word 0
.globl __mympandcnt  /* counts user mpands/syncands of this proc to shared addresses */
__mympandcnt:.word 0
.globl __mymporcnt   /* counts user mpors/syncors of this proc to shared addresses */
__mymporcnt:.word 0
.globl ___TICKETP__   /* points to current join's busTicket counter */
___TICKETP__:.word 0
.globl ___RANKP__   /* points to current join's private busRank variable */
___RANKP__:.word 0

/* ------- runtime error messages (spec. lengths at SYSCALL_WRITE): -------- */
/* Section .gsdata: shared data. The message strings are not terminated;
 * every routine that prints one passes the length as a literal in par3,
 * so a changed text requires the matching length literal to be updated. */
.section ".gsdata", .data
.globl ___STARTED_PROCS__
___STARTED_PROCS__:.int 0    /* dummy default, is set in the startup code */
.globl forklib_shmoverflow
forklib_shmoverflow:.ascii "shmalloc: permanent shared heap size exceeded, sorry\n" 
.globl forklib_shoverflow   /* printed by _shalloc, length 42 */
forklib_shoverflow:.ascii "shalloc: shared heap size exceeded, sorry\n" 
.globl forklib_proverflow   /* printed by _malloc, length 42 */
forklib_proverflow:.ascii "malloc: private heap size exceeded, sorry\n" 
.globl forklib_shstoverflow /* printed by forklib_sstacktest, length 29 */
forklib_shstoverflow:.ascii "shared stack overflow, sorry\n" 
.globl forklib_prstoverflow /* printed by stack_overflow and forklib_pstacktest, length 30 */
forklib_prstoverflow:.ascii "private stack overflow, sorry\n" 
.globl forklib_wronggroupnr /* printed by forklib_printwronggroupnr, length 19 */
forklib_wronggroupnr:.ascii "wrong group number\n" 
.globl forklib_sprocsexceeded
forklib_sprocsexceeded:.ascii "start: not so many processors allocated\n"
.globl forklib_divisionbyzero /* printed by forklib_divu, length 17 */
forklib_divisionbyzero:.ascii "division by zero\n"
.globl permsheapptr
permsheapptr:.word 0
.globl permsheaplimit
permsheaplimit:.word 0
.globl __ftb
__ftb:.word 0
/* counters for shared memory access statistics 990119 */
.globl __shldgcnt     /* counts user loads from shared addresses */
__shldgcnt:.word 0
.globl __shstgcnt     /* counts user stores to shared addresses */
__shstgcnt:.word 0
.globl __mpaddcnt     /* counts user mpadds/syncadds to shared addresses */
__mpaddcnt:.word 0
.globl __mpmaxcnt     /* counts user mpmaxs/syncmaxs to shared addresses */
__mpmaxcnt:.word 0
.globl __mpandcnt     /* counts user mpands/syncands to shared addresses */
__mpandcnt:.word 0
.globl __mporcnt      /* counts user mpors/syncors to shared addresses */
__mporcnt:.word 0

.section ".lddata", .bss
/*     Starting at cell 0 the loader writes the following entries:
 *        (cf. ~bird/.../sbo_startup/lddata.h)
 *     0> number of started processors
 *     1> length of the text segment
 *     2> length of the local stack (= ignored for FORK,
 *              misused as synchronization cell)
 *     3> length of the local heap (= ignored for FORK)
 *     4> length of the reserved global memory
 *     5> length of the section ".lddata"
 *     6> length of the section ".gsdata"
 *     7> length of the section ".gsbss"
 *     8> length of the section ".gpdata"
 *     9> length of the section ".gpbss"
 *    10> length of the section ".args" (arguments argc, argv, reserve pointer,
 *                                              argv[1], argv[2], ....)
 *     The labels below name these cells in the same order; forklib_startup
 *     reads them to lay out the private memory areas. */
.globl sbo_numprocs
sbo_numprocs:.word 0
.globl sbo_textlen
sbo_textlen:.word 0
.globl sbo_pstacklen
sbo_pstacklen:.word 0
.globl sbo_pheaplen
sbo_pheaplen:.word 0
.globl sbo_gmlen
sbo_gmlen:.word 0
.globl sbo_lddatalen
sbo_lddatalen:.word 0
.globl sbo_gsdatalen
sbo_gsdatalen:.word 0
.globl sbo_gsbsslen
sbo_gsbsslen:.word 0
.globl sbo_gpdatalen
sbo_gpdatalen:.word 0
.globl sbo_gpbsslen
sbo_gpbsslen:.word 0
.globl sbo_argslen
sbo_argslen:.word 0

.section ".text", .text

.globl forklib_startup        /*startup code*/
forklib_startup:
                    /*PARAMETERS:  none, but
                     * the required number of started processors
                     * (a compile-time constant!!!) is found in
                     * cell 0 of the global PRAM memory. It is made
                     * visible to the programmer in the shared data
                     * segment (.gsdata) under ___STARTED_PROCS__.
                     * (set via compiler option -nprocs)
                     *
                     * Overview of what the code below does:
                     *  1. sets the status and mode registers,
                     *  2. computes the size of one private memory area from
                     *     the loader table (stack + gpdata + gpbss + args),
                     *  3. synchronizes all processors on the stack-length
                     *     cell and obtains a consecutive number $ in r25
                     *     from the mpadd that repairs the cell,
                     *  4. sets the base register and copies .gpdata, .gpbss
                     *     and .args into the private area via forklib_movb,
                     *  5. sets the pointer registers, calls init_perm_sheap,
                     *     ___init_files and _main (with forklib_sync in
                     *     between) and finally leaves through __exit.
                     * The trailing digits 0/1 in the comments track the
                     * modulo of each instruction; do not insert or remove
                     * instructions without re-checking them.
                     */

/* First the default settings: */
reset_ex: /* set the SUPER flag in the status register */
gethi    SUPERFlag, r5
add	 r5,SUPERFlag&0x1fff,r5
gethi	 0xffffffff, r6
add  	 r6,0xffffffff&0x1fff, r6
putsr	 r5, r6
gethi    0x000ffdc0,par1
add      par1,0x000ffdc0&0x1fff,par1
putmod   R0,par1           /*set mode register*/

/* First of all the future private areas are computed: */
bmc      0              /*assert next mo=0*/
gethi    sbo_numprocs,par1
add      par1,sbo_numprocs&0x1fff,par1
ldg      par1,0,par1      /*load number of started processors, 0*/
nop
/*set base registers for private memory addressing*/
/*size of a private memory: stack + heap + gpdata + gpbss + args  */
gethi    sbo_gpdatalen,r23
add      r23,sbo_gpdatalen&0x1fff,r23
ldg      r23,0,r23    /*sizeof gpdata*/         /*0*/
nop
gethi    sbo_gpbsslen,r22
add      r22,sbo_gpbsslen&0x1fff,r22
ldg      r22,0,r22    /*sizeof gpbss*/          /*0*/
gethi    sbo_pstacklen,r20
add      r20,sbo_pstacklen&0x1fff,r20
mov      r20,r18       /*stack size cell (ex 3) misused as sync cell*/
ldg      r20,0,r20     /*sizeof stack  (heap=0 set in loader-sim), 0*/
add      r23,r22,r28
gethi    sbo_argslen,r21
add      r21,sbo_argslen&0x1fff,r21
ldg      r21,0,r21    /*sizeof args*/          /*0*/
add      r20,r28,r28
add      r21,r28,r28    /*r28 = size of private memory*/ /*0*/
mul      par1,r28,par3  /*compute total size of the PM, 1*/
gethi    sbo_gmlen,par4
add      par4,sbo_gmlen&0x1fff,par4
ldg      par4,0,par4       /*total memory size, 0*/
stg      par1,r18,0      /*set synchronization cell *(r18), hoisted 0 */
sub      par3,par4,Ret   /*Ret:= size of SM, 0*/
add      Ret,-1,eps      /*set eps, 1*/

/*prelude of the synchronization: guarantee par1 in sync cell 3*/
FORKLIB_NOCHMAL:
ldg      r18,0,r31           /*0*/
nop
sub      r31,par1,pc
bne      FORKLIB_NOCHMAL    /*now every processor sees par1 in cell 3*/

/*now synchronize via cell *(r18), ex cell 3:*/
add	 R0,-1,r26	 /*sync,0*/
mpadd	 r18,0,r26	 /*sync,1*/
nop
nop
FORKLIB_STARTUPSYNCLOOP:
ldg	 r18,0,r26	 /*sync:loop,0*/
nop	          	 /*sync:delay slot,1*/
add	 r26,0,r26	 /*sync:cmp ret>0?,0*/
bne	 FORKLIB_STARTUPSYNCLOOP /*sync: everyone there?, 1*/
ldg	 r18,0,r26       /*sync:cmp Sync,0*/
add	 R0,1,r25	 /*sync:1*/
add      r26,0,r26       /*compare with 0, 0*/
mpadd	 r18,0,r25	 /*repair sync cell,1, computes $ in r25*/
bne	 FORKLIB_STARTUPSYNCHRON /*sync: result of cmp, 0*/
nop	         	 /*sync: 2nd wave, 1*/
nop	         	 /*sync: 2nd wave, 0*/
FORKLIB_STARTUPSYNCHRON:

/*consecutive processor numbers $=0,... are available in r25: */
gethi    0x80000000,r18                                /*1*/
mul      r25,r28,r29                                   /*0*/
add      r29,Ret,r29     /*r29 := offset of the PM of $,   1*/
sub      r18,r29,r20
putbas   r20             /*set base register,  1*/

/*the sections .gpdata, .gpbss and .args are now copied
 *into the future private areas.
 *Parameters of movb: par1=source, par2=destination, par3=blocksize. */
gethi    sbo_lddatalen,par1
add      par1,sbo_lddatalen&0x1fff,par1
ldg      par1,0,par1       /*compute index of the first cell of .gpdata:,  0*/
mov      r18,app         /*app = (base+) first private location     1*/
gethi    sbo_gsdatalen,par2    /* as |lddata|+|gsbss|+|gsdata| (+1) */
add      par2,sbo_gsdatalen&0x1fff,par2
ldg      par2,0,par2                                         /*0*/
add      app,r22,app     /*app += sizeof gpbss              *1*/
gethi    sbo_gsbsslen,par4
add      par4,sbo_gsbsslen&0x1fff,par4
ldg      par4,0,par4                                         /*0*/
add      par1,par2,par1                                    /*1*/
add      par1,par4,par1  /*source address in par1*/        /*0*/
mov      par1,gps        /*set gps*/                       /*1*/
add      app,r23,app     /*app += sizeof gpdata = start of args,0 */
add      r23,r22,par3    /*sizeof gpbss + gpdata */        /*1*/
add      r21,par3,par3   /*block size =sizeof(gpdata+gpbss+args)  0*/
mov      r18,par2        /*destination: 0x80000000 + base)         1*/
add      par1,1,par1     /*TEST CWK, with this it works at any rate*/
add      par2,par3,gpp   /*set gpp   1*/
add      gpp,3,spp       /*set spp,  0*/
mov      spp,fpp         /*set fpp   1*/
getlo    forklib_movb,Ret                                       /*0*/
jsrg     spp,Ret,0       /*Block-Move                             1*/

/* set the remaining registers: */
add      r18,r28,epp     /*epp := 0x80000000 + sizeof PM - 1:     0*/
add      epp,-1,epp      /*set epp,  1*/
add      gps,2,sps       /*set sps   0*/
mov      sps,fps         /*set fps   1*/
getlo    0,aps           /*aps: no shared arguments of main allowed!,0*/

/*store #started processes = r31 in  ___STARTED_PROCS: */
gethi    ___STARTED_PROCS__,par1                                /*1*/
add      par1,___STARTED_PROCS__&0x1fff,par1                    /*0*/
stg      r31,par1,0                                             /*1*/

/*init synchronization cell SM[gps+1] with ___STARTED_PROCS__: 950413*/
stg      r31,gps,1     /*preset synchronization cell        0*/
#if 0
/*init synchronization cell SM[gps+1] with 1: */
getlo    1,r31
nop
#endif
gethi    ___PROC_NR__,par1                                /*1*/
#if 0
ersetze
stg      r31,gps,1     /*preset synchronization cell        1*/
durch
#endif
/*and store original processor number $==r25 in ___PROC_NR__:*/
add      par1,___PROC_NR__&0x1fff,par1                      /*0*/
stg      r25,par1,0                                         /*1*/

/*set group number @=0 for all processes and store them in PM[1] */
getlo    0,par1                                             /*0*/
stg      par1,gpp,1     /*@=0*/                             /*1*/
/*and $ in PM[2]: */
stg      r25,gpp,2                                         /*1*/

#if 0
/* 13.4.95 CWK: outside start() now asynchronous with ALL 
STARTEDPROCS processors remaining active! -> Bus */

/*disable all processes but that with  $=0: */
mov      r25,r25        /*$>0 => set SHADOW*/               /*0*/
beq      FORKLIB_START_NOPS /*stay synchronous!               1*/
getsr    par1                                               /*0*/
or       par1,SHADOW,par1                                   /*1*/
putsr    par1,par1                                          /*0*/
bra      FORKLIB_START_CONT                                 /*1*/
FORKLIB_START_NOPS:
nop                /* busy work for proc. $==0                0*/
nop
nop
nop                                                         /*1*/

FORKLIB_START_CONT:
#endif
/* call the asynchronous init_files routine: */
getlo    init_perm_sheap,Ret
jsrg     spp,Ret,0
getlo    forklib_sync,Ret     /*re-synchronize*/
jsrg     spp,Ret,0
getlo    ___init_files, Ret   /* needs shmalloc */
jsrg     spp,Ret,0
getlo    forklib_sync,Ret     /*re-synchronize*/
jsrg     spp,Ret,0
getlo    _main,Ret                                          /*0*/
jsrg     spp,Ret,0      /*call main program*/               /*1*/
getlo    0,par1         /*regular exit(0)*/                 /*0*/
bra      __exit   /*quit fork program without calling atexit functions *1*/
/* now we have the following zeropage settings:
 * PM:  3 |                |<-spp,  SM: 3 |                 |
 *      2 | $ (updatable)  |  fpp       2 | (junk)          |<-fps,sps   
 *      1 | @=0 (updatable)|            1 | 1 (sync cell)   |      
 *      0 | $ (fix)        |<-gpp,      0 | __STARTED_PROCS |<-gps
 *        +----------------+              +-----------------+    */
/* NOTE(review): the sync cell SM[gps+1] is preset with the number of
 * started processors above (950413), not with 1 as the diagram shows. */
/* End of forklib_startup*/


/* _exit: placeholder for the atexit() processing, then continues at __exit.
 * NOTE(review): the header says the exit status arrives in Ret, but
 * _sys_exit overwrites Ret with SYSCALL_EXIT before the sysc, and
 * forklib_startup passes the status of a regular exit in par1 -- confirm
 * which register the exit syscall actually reads. */
.globl _exit          /*exit program, call atexit() functions */
_exit: /*parameter: Ret=return val to OS*/
nop              /*later: call atexit functions*/
bra       __exit


/* __exit: terminates the program by calling _sys_exit, without running
 * any atexit functions. Shadowed processors never come back from
 * _sys_exit (they spin there). */
.globl __exit
__exit:       /*quit program via syscall*/

bsrg      spp,_sys_exit
ret        /*return to operating system*/
nop
/* End of __exit */


.globl _reltoabs
_reltoabs:
/* convert par1 (possibly relative to BASE) to an absolute address: 950428 CWK
 *   par1 = address, either absolute or BASE-relative
 *   Ret  = absolute address
 * BASE-relative (private) addresses are marked by the most significant
 * bit 0x80000000: forklib_startup places the private area there and
 * _shallfree tests the same bit. The former mask 0x10000000 tested
 * bit 28 instead, so relative addresses were returned unconverted. */
gethi    0x80000000,Ret
and      Ret,par1,pc    /*test if leftmost bit set (flags only)*/
bne      3              /*if set, convert to absolute value*/
return
mov      par1,Ret       /*otherwise, return unchanged (return delay slot)*/
/*convert to absolute value:*/
getbas   Ret            /* load base register */
return
add      Ret,par1,Ret    /* add base register to produce absolute address */


.globl _getct
_getct:
/* return current value of (global) counter in Ret; no parameters.
 * The getct executes in the delay slot of the return. */
return
getct    Ret


/* the synchronization routine is externally called barrier() */
/* Barrier over the synchronization cell SM[gps+1]:
 *   1. every processor adds -1 to the cell (mpadd),
 *   2. it polls the cell until it reads 0,
 *   3. it adds +1 again (repairing the cell for the next barrier); the
 *      two trailing nops are only executed when the re-read cell was
 *      still 0, which the original comments call the 2nd wave.
 * With PROFILING defined, the barrier is counted in __barrcnt and the
 * getct difference across the barrier is accumulated in __barrWaitTime;
 * r29 is saved on and restored from the private stack.
 * The instruction sequence is modulo-sensitive (see the 0/1 marks and
 * the WARNING at the top of the file); do not reorder. */
.globl forklib_sync
.globl _barrier
forklib_sync: /*no parameter; uses r31, r30*/
_barrier:
#ifdef PROFILING
gethi    __barrcnt,r30   /*moved from gen.c 990504*/
add      r30,__barrcnt&0x1fff,r30
getlo    1,r31
syncadd  r31,r30,0       /*count barrier*/
/*no round separation required, since these counters are
 *only accessed by syncadd. They are read only in printAccStat()
 */
pshg     r29,spp         /*990504*/
getct    r29             /*time start*/
#endif
bmc      0               /*force modulo 0*/
add	 R0,-1,r30	 /*sync,0*/
mpadd	 gps,1,r30	 /*sync,1*/
/*nop                      /*delay of mpadd*/
/*nop                      /*modulo ++*/
FORKLIB_SYNCLOOP:
ldg	 gps,1,r30	 /*sync:loop,       0*/
getlo    1,r31  	 /*sync:1, instead of nop1*/
add	 r30,0,r30	 /*sync:cmp ret>0?, 0*/
bne	 FORKLIB_SYNCLOOP /*sync: everyone there?,  1*/
ldg	 gps,1,r30       /*sync:cmp Sync,   0*/
mpadd	 gps,1,r31	 /*repair sync cell,1*/
add      r30,0,r30       /*compare with 0,  0*/
bne	 FORKLIB_SYNCHRON /*sync: result of cmp, 1*/
nop	         	 /*sync: 2nd wave,    0*/
nop	         	 /*sync: 2nd wave,    1*/
FORKLIB_SYNCHRON:
#ifdef PROFILING
getct    r30             /*time stop*/
sub      r29,r30,r30     /*time difference = waittime + overhead*/
gethi    __barrWaitTime,r31
add      r31,__barrWaitTime&0x1fff,r31
ldg      r31,0,r29 
popg     spp,r29         /*restore r29*/
add      r29,r30,r30     /*uses previous r29 value*/
stg      r30,r31,0
#endif
return                   /*sync: end,       0*/
nop                                       /*1*/
/*the following instruction's modulo is 0. do not change this invariant!*/
/* End of forklib_sync */

.globl _shalloc  /*formerly forklib_shalloc*/
_shalloc: /*par1 = number of memory cells to allocate*/
          /*Ret  = address of the allocated block (the new eps), 0 on failure*/
          /*uses par2,par3 if warning is emitted*/
/* Allocates par1 cells by moving eps down. The request fails when the
 * subtraction overflows or when eps would no longer lie above sps; in
 * that case the processor whose ___PROC_NR__ is 0 writes the
 * forklib_shoverflow message to stderr and 0 is returned.
 * On failure eps is restored to its previous value, so that a rejected
 * request does not stay charged to the shared heap (formerly eps was
 * left decremented, which made forklib_sstacktest abort afterwards). */
/* (a forklib_sync call used to precede the allocation; it is disabled) */
sub      par1,eps,eps    /*shalloc: eps -= par1,   0*/
bvs      3               /*underflow (msb of eps becomes 0)? -> failure path*/
sub      sps,eps,pc      /*compare,   0*/
bgt      FORKLIB_EXITSHALLOC /*OK ?   1*/
/* failure path, also the target of the bvs 3 above: */
add      par1,eps,eps    /*undo the allocation; par1 is still intact here*/
/*processor 0 prints the error message:*/
gethi    ___PROC_NR__,Ret
add      Ret,___PROC_NR__&0x1fff,Ret
ldg      Ret,0,Ret                     /*indirection*/
nop                                    /*load delay*/
mov      Ret,Ret
bne      FORKLIB_SHALLOC_RETURNS_NULL
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_shoverflow,par2
add      par2,forklib_shoverflow&0x1fff,par2
getlo    42,par3    /*length of forklib_shoverflow*/
sysc                /*write message */
FORKLIB_SHALLOC_RETURNS_NULL:
return
getlo    0,Ret      /*return NULL (return delay slot)*/
FORKLIB_EXITSHALLOC:
return    /*the returned pointer is eps*/
mov      eps,Ret
/* End of forklib_shalloc*/


/* _shallfree: resets eps to a value saved in a frame, thereby releasing
 * everything _shalloc handed out since then. Which frame is used depends
 * on bit 0x80000000 of the word at [fpp,-1]. Uses Ret and r31. */
.globl _shallfree  /*release objects shalloc()ed so far in current function*/
_shallfree: /*no parameter, no return value*/
/*first: test whether this is an asynchronous function ([fpp,-1]==fpp_old)
 *       i.e., whether there is a relative address in [fpp,-1]: */
ldg      fpp,-1,Ret
gethi    0x80000000,r31
and      Ret,r31,r31   /*if true, this function is asynchronous*/
beq      3             /*bit clear: skip to the synchronous case below*/
/*restore eps for asynchronous function:*/
return
ldg      gpp,-2,eps    /*restore eps from private group frame*/

/*restore eps for synchronous function:*/
return
ldgn     fpp,-1,eps    /*restore eps from private sync procedure frame*/


.globl _malloc        /*formerly forklib_malloc*/
_malloc:  /*par1 = number of memory cells to allocate*/
          /*returns resulting absolute address in Ret; 0 if par1 is 0 or on failure*/
          /*uses par2,par3 if warning is emitted*/
/* Allocates par1 cells by moving epp down. The request fails when the
 * subtraction overflows or when epp would no longer lie above spp; a
 * processor that is not shadowed then writes the forklib_proverflow
 * message to stderr and 0 is returned.
 * On failure epp is restored to its previous value. Formerly it was left
 * decremented, so a single oversized request made every later _malloc
 * and forklib_pstacktest fail as well. */
mov      par1,par1                           /*0*/
beq      FORKLIB_MALLOC_RETURNS_NULL /*size 0: return NULL*/
sub      par1,epp,epp    /*malloc: epp -= par1, 0*/
bvs      3               /*underflow (msb of epp becomes 0)? -> failure path*/
sub      spp,epp,pc      /*compare 0*/
bgt      FORKLIB_EXITMALLOC /*no overflow?, 1*/
/* failure path, also the target of the bvs 3 above: */
add      par1,epp,epp    /*undo the allocation; par1 is still intact here*/
/* additional feature: active processor prints a message if heap size exceeded */
getsr    Ret
and      Ret,SHADOW,Ret
bne      FORKLIB_MALLOC_RETURNS_NULL
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_proverflow,par2
add      par2,forklib_proverflow&0x1fff,par2
getlo    42,par3    /*length of forklib_proverflow*/
bms      0          /*assert next mo=1*/
sysc                /* write message */
FORKLIB_MALLOC_RETURNS_NULL:
return
getlo    0,Ret      /*return NULL (return delay slot)*/
FORKLIB_EXITMALLOC:
/* now translate epp (relative to BASE) to an absolute address: 950216 CWK*/
getbas   Ret            /* 0, load base register */
nop
return                  /* 0 */
add      Ret,epp,Ret    /* 1 add base register to produce absolute address */
/* (before 950216 the relative value was returned: mov epp,Ret) */
/* End of _malloc */


.globl _free    /*provisional dummy function: nothing is released*/
_free:  /*par1 = memory area to be released (ignored)*/
return  /*no return value*/
nop


.globl _realloc
_realloc:  /*par1 = pointer to the malloc-allocated block*/
           /*par2 = new size*/
           /*Ret  = address of the new block, 0 if _malloc failed*/
           /*uses par3, r30, r31*/
/* Allocates a new block of par2 cells with _malloc and copies par2 words
 * from the old block into it with forklib_movb. The old block is not
 * released.
 * forklib_movb uses Ret as its transfer register and advances par2, so
 * the new block address is parked in r30 across the copy. Formerly the
 * routine returned whatever movb left in Ret, i.e. the last prefetched
 * word instead of the new pointer. */
mov     par1,r30                 /*save old block pointer   0*/
mov     par2,r31                 /*save new size*/
mov     par2,par1                /*size argument for _malloc*/
bsrg    spp,_malloc
mov     Ret,par2                 /*destination; also sets the flags*/
beq     FORKLIB_EXITREALLOC      /*if malloc returned NULL: copy nothing*/
mov     r30,par1                 /*source*/
mov     r31,par3                 /*size*/
mov     Ret,r30                  /*keep new pointer; occupies the slot of the former nop*/
bsrg    spp,forklib_movb
return
mov     r30,Ret                  /*return the new block (return delay slot)*/
FORKLIB_EXITREALLOC:
return                           /*Ret is 0 here*/
nop

/* stack_overflow: writes the forklib_prstoverflow message (30 characters)
 * to stderr and leaves the program through _exit. No parameters.
 * NOTE(review): unlike the other message routines there is no bms 0 in
 * front of the sysc here -- confirm whether that is intended. */
.globl stack_overflow      /*needed by hirbli's libc.a*/
stack_overflow:            /*print message "pr. stack overflow"*/
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_prstoverflow,par2
add      par2,forklib_prstoverflow&0x1fff,par2
getlo    30,par3    /*length*/
sysc                /*write msg string*/
add      R0,1,Ret        /*exit value =1     0*/
bra      _exit    /*abort program     1*/

/* forklib_sstacktest: checks that eps still lies above sps. If not, the
 * forklib_shstoverflow message (29 characters) is written to stderr and
 * the program is aborted through _exit. */
.globl forklib_sstacktest
forklib_sstacktest:  
         /*no parameter, no return value*/
sub      sps,eps,pc      /*compare (flags only)*/  /*0*/
bgt      FORKLIB_EXITSSTACKTEST  /*ok ?*/  /*1*/
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_shstoverflow,par2
add      par2,forklib_shstoverflow&0x1fff,par2
getlo    29,par3    /*length*/
bms      0          /*assert next mo=1*/
sysc                /*write msg string*/
add      R0,1,Ret        /*exit value =1     0*/
bra      _exit    /*abort program     1*/
FORKLIB_EXITSSTACKTEST:
return                                     /*0*/
nop                                        /*1*/
/* End of forklib_sstacktest*/

/* forklib_pstacktest: checks that epp still lies above spp. If not, the
 * forklib_prstoverflow message (30 characters) is written to stderr and
 * the program is aborted through _exit. */
.globl forklib_pstacktest
forklib_pstacktest:
         /*no parameter, no return value*/
sub      spp,epp,pc      /*compare (flags only)*/   /*0*/
bgt      FORKLIB_EXITPSTACKTEST   /*ok?, 1*/
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_prstoverflow,par2
add      par2,forklib_prstoverflow&0x1fff,par2
getlo    30,par3    /*length*/
bms      0          /*assert next mo=1*/
sysc                /*write msg string*/
add      R0,1,Ret        /*exit value =1, 0*/
bra      _exit    /*abort program*/
FORKLIB_EXITPSTACKTEST:
return                                  /*0*/
nop                                     /*1*/
/* End of forklib_pstacktest*/

/* forklib_printwronggroupnr: writes the forklib_wronggroupnr message
 * (19 characters) to stderr and aborts the program through _exit. */
.globl forklib_printwronggroupnr
forklib_printwronggroupnr:
         /*no parameter, no Ret*/
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_wronggroupnr,par2
add      par2,forklib_wronggroupnr&0x1fff,par2
getlo    19,par3    /*length*/
bms      0          /*assert next mo=1*/
sysc                /*write msg string*/
/*ldg      gpp,1,par2      *@                      0*/
add      R0,1,Ret        /*exit value =1          0*/
bra      _exit    /*abort program*/
/*End of forklib_printwronggroupnr*/

.globl forklib_divu  /*unsigned int division */
forklib_divu:
     /* par1 = dividend,
      * par2 = divisor
      * par3 = remainder modulo divisor (returned)
      * par4 = high(dividend), auxiliary variable
      * Ret  = result of the division
      * r30  = 0xffffffff as auxiliary constant
      * r31  = counter
      *
      * A divisor of 0 prints forklib_divisionbyzero and aborts via _exit.
      * A divisor of the form 1<<k is handled by shifting and masking.
      * Otherwise a 32-step shift-and-subtract loop is used; the running
      * remainder lives in par4, while par3 only holds the trial
      * difference of the current step. The remainder is therefore copied
      * from par4 to par3 on exit. Formerly par3 was returned as is, which
      * is negative whenever the last quotient bit is 0. */
/*test whether the divisor is a power of two:*/
mov   par2,par2
beq   FORKLIB_DIVUBYZERO
getlo 1,par4         /* 1*/
rm    par2,r31
lsl   par4,r31,par4  /*par4 := 2^log(par2) */
sub   par4,par2,pc
beq   FORKLIB_SHIFTDIVU /*catch the special case*/
add   R0,0,Ret     /*init result := 0  */
add   R0,0,par4    /*init high := 0    */
add   R0,32,r31    /*init counter := 32*/
add   R0,-1,r30    /*r30:= 0xffffffff*/
FORKLIB_DIVULOOP:
rocl  par1,1,par1   /*msb(dividend)->carry*/
rocl  par4,1,par4   /*carry->lsb(high)  */
sub   par2,par4,par3 /*trial subtraction: par3 := high - divisor*/
blt   5             /*negative: divisor does not fit, go shift in a 0*/
rocl  r30,1,r30     /*set carry := 1 */
rocl  Ret,1,Ret     /*carry->lsb(result)*/
mov   par3,par4     /*keep the reduced remainder*/
bra   2             /*skip the 0 case*/
lsl   Ret,1,Ret     /*0 -> lsb(result) */
add   r31,-1,r31    /*decrement counter*/
bne   FORKLIB_DIVULOOP
bmc   0             /*force_modulo_0*/
return  /* quotient in Ret, remainder in par3, 0*/
mov   par4,par3     /*remainder is the final high part (return delay slot)*/
FORKLIB_SHIFTDIVU:   /*special case: the result can be found
                       by shifting. The distance is in r31 */
/* NOTE(review): asr is an arithmetic shift; for an unsigned dividend
 * with the msb set a logical shift would be needed -- confirm. */
asr   par1,r31,Ret   /*because par1 / (2^k) == par1 >> k  */
getlo 1,r30
sub   r30,par4,par4  /*2^log(par2) -1 */
and   par1,par4,par3 /*because par1 % (2^k) == par1 & (2^k - 1) */
bmc     0            /*force_modulo_0*/
return  /* quotient in Ret, remainder in par3, 0*/
nop 
FORKLIB_DIVUBYZERO:  /*divisor is 0 => print error message*/
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_divisionbyzero,par2
add      par2,forklib_divisionbyzero&0x1fff,par2
getlo    17,par3    /*length*/
sysc                /*write msg string*/
add      R0,1,Ret        /*exit value =1     0*/
bra      _exit    /*abort program     1*/


.globl forklib_divi  /*signed int division */
forklib_divi:
     /* par1 = dividend,
      * par2 = divisor
      * par3 = remainder modulo divisor (returned)
      * par4 = high(dividend), auxiliary variable
      * Ret  = result of the division
      * r30  = 0xffffffff as auxiliary constant
      * r31  = counter
      *
      * Negates negative operands, pushes the sign of the result (+1/-1)
      * on the private stack, calls forklib_divu and multiplies the
      * quotient by the saved sign.
      * NOTE(review): par3 is passed through from forklib_divu unchanged,
      * i.e. the remainder is that of the absolute values -- confirm that
      * callers do not expect a signed remainder. */
/*first determine the sign and form the absolute values of par1,par2: */
getlo 1,par4         /* +1, for the sign, but also needed for the power-of-two test*/
sub   par4,R0,par3   /* -1, for negation in case of a negative sign*/
gethi 0x80000000,r31
and   r31,par1,r30   /*r30 := msb(dividend)*/
beq   2
mul   par1,par3,par1 /*par1 := -par1 */
and   r31,par2,r31   /*r31 := msb(divisor)*/
beq   2
mul   par2,par3,par2 /*par2 := -par2 */
add   r31,r30,pc
bpl   3              /*signs of dividend and divisor equal?*/
sub   par4,R0,Ret    /*sign := -1 */
bra   2
getlo 1,Ret          /*sign := 1 */
bms   0              /*force modulo 1*/
pshg  Ret,spp        /*save sign on stack*/
nop
bsrg  spp,forklib_divu
popg  spp,r30        /*get sign of result*/
nop                  /*delay*/
mul   r30,Ret,Ret    /*set correct sign*/
nop                  /*modulo-nop*/
return  /* quotient in Ret, remainder in par3, 0*/
nop 


.globl forklib_fdiv      /*Floatingpoint-Division a/x */
forklib_fdiv:
        /* par1: Dividend a
           par2: Divisor x
           uses Ret, r31, par3 and par4 as scratch registers
           Ret: returns result

           Method, as visible below: the sign bit of x is masked off, an
           initial estimate y of 1/|x| is formed from the exponent field
           (0x7e800000 minus the exponent bits), and y is refined five
           times by y := y * (2.0 - |x|*y). The result is a * y with the
           sign bit of x xor-ed in.
           NOTE(review): there is no check for x == 0 here -- confirm
           what callers rely on in that case.
         */
gethi	2147483647, par3           /*0x7fffffff*/
add	par3, (2147483647)&0x1fff, par3
and	par2, par3, par3           /*par3 := |x|*/

gethi	2139095040, r31            /*0x7f800000*/
and	par3, r31, r31             /*exponent field of x*/
gethi	2122317824, Ret            /*0x7e800000*/
sub	r31, Ret, Ret              /*initial estimate y*/

fmul	par3, Ret, r31             /*x*y*/
gethi	0x40000000, par4           /*IEEE representation of 2.0 */
add	par4, 0x40000000&0x1fff, par4
fsub	r31, par4, r31             /*2.0 - x*y*/
fmul	Ret, r31, Ret              /* (  ''  ) * y */

fmul	par3, Ret, r31             /*2nd refinement step*/
fsub	r31, par4, r31
fmul	Ret, r31, Ret

fmul	par3, Ret, r31             /*3rd refinement step*/
fsub	r31, par4, r31
fmul	Ret, r31, Ret

fmul	par3, Ret, r31             /*4th refinement step*/
fsub	r31, par4, r31
fmul	Ret, r31, Ret

fmul	par3, Ret, par3            /*5th step; par3 and par4 are no longer needed afterwards*/
fsub	par3, par4, par4
fmul	Ret, par4, Ret

gethi	-2147483648, r31            /*0x80000000*/
and	par2, r31, par2             /*sign bit of divisor*/
fmul	Ret, par1, Ret              /*a * (1/x)*/
return
xor	Ret, par2, Ret              /*adjust sign bit of result*/


/* forklib_movb: copies par3 words from par1 to par2 in ascending order.
 * Ret is the transfer register and is clobbered; par1, par2 and par3 are
 * modified. Each pass stores the word fetched before and prefetches the
 * next one, so one word behind the source block is read but not stored. */
.globl forklib_movb     /*block-move forward, fixed size*/
forklib_movb:
        /*par1 = source address*/
        /*par2 = destination address*/
        /*par3 = block size (words)*/
ldg     par1,0,Ret  /*prefetch first*/  /*0*/
mov     par3,par3                       /*1*/
beq     FORKLIB_MOVB_FERTIG             /*size 0: nothing to do, 0*/
FORKLIB_MOVBLOOP:
pshg    Ret,par2     /*flush current item 1*/
popng   par1,1,Ret   /*prefetch next*/  /*0*/
add     par3,-1,par3 /*decrement blk size,1*/
bne     FORKLIB_MOVBLOOP                /*0*/
FORKLIB_MOVB_FERTIG:
/*nop                                     /*1*/
return  /*no result*/                   /*0*/
nop                                     /*1*/

/* _memcpy: copies par3 words from par2 to par1 in ascending order. Ret is
 * set to the original destination address before the loop and is not
 * touched afterwards. Uses r31 as transfer register; par1, par2 and par3
 * are modified. */
.globl _memcpy   /* like forklib_movb, with par1 <-> par2 */
_memcpy:
        /*par1 = destination address*/
        /*par2 = source address*/
        /*par3 = block size (words)*/
add     par2,-1,par2   /*popng uses offset 1*/ 
popng   par2,1,r31  /*prefetch first*/
mov     par1,Ret    /*Ret := destination*/
mov     par3,par3
beq     FORKLIB_MEMCPY_FERTIG /*size 0: nothing to copy*/
FORKLIB_MEMCPY_LOOP:
pshg    r31,par1     /*flush current item 1*/
popng   par2,1,r31   /*prefetch next*/  /*0*/
add     par3,-1,par3 /*decrement blk size,1*/
bne     FORKLIB_MEMCPY_LOOP                /*0*/
FORKLIB_MEMCPY_FERTIG:
return                                  /*0*/
nop                                     /*1*/


.globl _strcpy     /*block-move forward up to the \0 end, asynchronous*/
_strcpy:
        /*par1 = destination address*/
        /*par2 = source address*/
        /*Ret  = original destination address; uses r30*/
/* Copies the items of the source string, including the terminating \0,
 * to the destination. Formerly the loop left as soon as the \0 had been
 * fetched, without storing it, so the copy was never terminated. */
add     par2,-1,par2  /*popng uses offset 1*/
popng   par2,1,Ret   /*prefetch first item*/
mov     par1,r30     /*save return value*/
mov     Ret,Ret      /*look at first item*/
beq     FORKLIB_STRCPY_TERMINATE /*empty string: only the \0 is stored*/
FORKLIB_STRCPY_LOOP:
popng   par2,1,Ret   /*prefetch next*/
pshg    Ret,par1     /*flush current item: in the load delay slot Ret still holds it*/
mov     Ret,Ret      /*look at new item */
bne     FORKLIB_STRCPY_LOOP
FORKLIB_STRCPY_TERMINATE:
pshg    Ret,par1     /*Ret is \0 here: store the terminator as well*/
/* this file-global label must stay on the plain return sequence,
 * because it can be branched to from outside this routine */
FORKLIB_STRCPY_FERTIG:
return
mov     r30,Ret      /*return pointer to dest */


/* _strcmp: compares two strings item by item. The loop ends when the
 * product of the two current items is 0 (at least one string has ended)
 * or when the items differ; the result is the difference of the last
 * pair of items, item of string 1 minus item of string 2.
 * Uses r30, r31; par1 and par2 are modified. */
.globl _strcmp     /*string-compare with no upper limit*/
_strcmp:
        /*par1 = string 1, par2 = string 2 */
        /*Ret < 0 if string1 <lex string2, 0 if ==lex, >0 if >lex*/
getlo   0,Ret
add     par1,-1,par1 /*popng uses offset 1*/
add     par2,-1,par2
FORKLIB_STRCMP_LOOP:
popng   par1,1,r31   /*r31 := next item of string 1*/
popng   par2,1,r30   /*r30 := next item of string 2*/
nop                  /*delay*/
mul     r30,r31,pc             /*finish if at least one string is finished*/
beq     FORKLIB_STRCMP_FERTIG
sub     r30,r31,Ret  /*compare current items, finish if different*/
beq     FORKLIB_STRCMP_LOOP
FORKLIB_STRCMP_FERTIG:
return  /*Ret*/
sub     r30,r31,Ret  /*compute result for \0 character, otherwise just repeat */


/* _strncmp / _memcmp: compare at most par3 items of two strings. The loop
 * ends when one of the current items is 0, when the items differ, or when
 * the counter is used up; the result is the difference of the last pair
 * of items, item of string 1 minus item of string 2.
 * Uses r30, r31; par1, par2 and par3 are modified.
 * Formerly the path for a 0 item branched to FORKLIB_STRCPY_FERTIG, whose
 * return delay slot overwrote Ret with r30, so the computed difference
 * was lost. It now returns on its own, like _strcmp does.
 * NOTE(review): the first pair of items is compared even if par3 is 0,
 * and _memcmp also stops at a 0 item -- confirm whether callers depend
 * on either. */
.globl _memcmp
.globl _strncmp     /*string-compare with upper limit*/
_strncmp:
_memcmp:
        /*par1 = string 1, par2 = string 2, par3 = maxsize */
        /*Ret < 0 if string1 <lex string2, 0 if ==lex, >0 if >lex*/
getlo   0,Ret
add     par1,-1,par1 /*popng uses offset 1*/
add     par2,-1,par2
FORKLIB_STRNCMP_LOOP:
popng   par1,1,r31   /*r31 := next item of string 1*/
popng   par2,1,r30   /*r30 := next item of string 2*/
nop                  /*delay*/
mul     r30,r31,pc             /*finish if at least one string is finished*/
beq     FORKLIB_STRNCMP_FAST_FERTIG
sub     r30,r31,Ret  /*compare current items, finish if different*/
bne     FORKLIB_STRNCMP_FERTIG
add     par3,-1,par3 /*dec counter*/
bgt     FORKLIB_STRNCMP_LOOP
FORKLIB_STRNCMP_FERTIG:
return  /*Ret holds the last difference*/
nop
FORKLIB_STRNCMP_FAST_FERTIG:
return
sub     r30,r31,Ret  /*compute correct result for \0 character (return delay slot)*/



.globl _strlen     /*string length, asynchronous*/
_strlen:
        /*par1 = string address*/
        /*Ret => string size (words), not counting the terminating \0*/
        /*uses r31; par1 is modified*/
/* popng fetches with offset 1, so par1 is moved back by one first, as the
 * other string routines in this file do. Formerly the first item was
 * never examined: the count came out right only because the terminator
 * was counted in its place, and an empty string was scanned beyond its
 * terminator. The counter now starts at -1 so that the terminator itself
 * is not counted. */
add     par1,-1,par1 /*popng uses offset 1*/
add     R0,-1,Ret    /*the \0 is counted by the loop but is not part of the length*/
FORKLIB_STRLEN_LOOP:
popng   par1,1,r31   /*fetch next item*/
add     Ret,1,Ret    /*increment Ret (fills the load delay slot)*/
mov     r31,r31      /*compare with 0*/
bne     FORKLIB_STRLEN_LOOP
return  /*result is in Ret*/
nop


.globl _srand       /*seed random generator*/
_srand:
        /*par1: unsigned int seed */
        /*stores the seed in forklib_randomnumber; Ret is left holding
          the address of that variable*/
gethi   forklib_randomnumber, Ret
add     Ret,forklib_randomnumber & 0x1fff,Ret
return
stg     par1,Ret,0  /*store the seed (return delay slot)*/


/* _rand: no parameters. Loads forklib_randomnumber, ORs in 0x1abc,
 * multiplies by the constant 9749741 and rotates the product right by 7.
 * The result is returned in Ret and stored back as the new state.
 * Uses r30 and r31. */
.globl _rand        /*random number generator*/
_rand:
gethi   forklib_randomnumber, r31
add     r31,forklib_randomnumber & 0x1fff,r31
ldg     r31,0,r30      /*r30 := old state*/
gethi   9749741,Ret
add     Ret,(9749741)&0x1fff,Ret
or      r30,0x1abc,r30 /*avoid zero factor*/
mul     r30,Ret,r30
ror     r30,7,Ret      /*to allow also even results*/
return
stg     Ret,r31,0      /*store new state (return delay slot)*/


/* _random: like _rand, but the state lives in the word that par1 points
 * to. The new value is returned in Ret and stored back through par1.
 * Uses r30.
 * NOTE(review): unlike _rand, the state is not ORed with a constant
 * first, so a state of 0 stays 0 -- confirm that this is intended. */
.globl _random        /*random number generator with parameter*/
_random:
/* address of old (and new) random number is in par1 */
ldg     par1,0,r30
gethi   9749741,Ret
add     Ret,(9749741)&0x1fff,Ret
mul     r30,Ret,Ret
ror     Ret,7,Ret      /*to allow also even results*/
return
stg     Ret,par1,0     /*store new state (return delay slot)*/



/* ============ SYSCALLS: =================================== */

/* _open: issues SYSCALL_OPEN with the caller's par1..par3 unchanged.
 * With FTELL_EXPLICIT the __filepos entry indexed by the value in Ret
 * after the sysc is reset as well. Uses r31 in that case. */
.globl _open
_open:            /* par1 = filename */
                  /* par2 = mode */
                  /* par3 = umask */
getlo   SYSCALL_OPEN,Ret
sysc
#ifdef FTELL_EXPLICIT
gethi   __filepos,r31
add     r31,__filepos&0x1fff,r31
add     r31,Ret,r31
stgc    r31       /* set current write/read position to zero */
#endif
return            /* no synchronization needed, since async */
nop               /* channel remains in Ret */


/* _close: issues SYSCALL_CLOSE for the channel in par1. Ret is left as
 * the sysc set it. */
.globl _close
_close:           /* par1 = channel */
getlo   SYSCALL_CLOSE,Ret
sysc
return            /* no synchronization needed, since async */
nop               /* Ret is not modified after the sysc */


/* _read: issues SYSCALL_READ with the caller's par1..par3 unchanged,
 * unless the SHADOW bit of the status register is set; shadowed
 * processors return without issuing the syscall. Uses r31 (and r30 with
 * FTELL_EXPLICIT, where __filepos[channel] is advanced by the requested
 * length before the call). */
.globl _read
_read:            /* par1 = channel */
                  /* par2 = buffer */
                  /* par3 = length of buffer */
#ifdef FTELL_EXPLICIT
gethi   __filepos,r31
add     r31,__filepos&0x1fff,r31
add     r31,par1,r31   /* & filepos[channel] */
ldg     r31,0,r30
nop
add     r30,par3,r30
stg     r30,r31,0      /* increment current write/read position by length*/
#endif
getsr   r31
and     r31,SHADOW,r31
bne     4               /*shadowed processors skip read*/
nop
getlo   SYSCALL_READ,Ret
sysc
return            /* no synchronization needed, since async */
nop

.globl _write
_write:           /* Issues the SYSCALL_WRITE system call. */
                  /* par1 = channel */
                  /* par2 = buffer */
                  /* par3 = length of buffer */
                  /* Uses r30 (FTELL_EXPLICIT only) and r31 as scratch. */
#ifdef FTELL_EXPLICIT
/* Only with FTELL_EXPLICIT: advance __filepos[channel] by the requested
 * length par3 (this happens before the system call is issued). */
gethi   __filepos,r31
add     r31,__filepos&0x1fff,r31
add     r31,par1,r31   /* & filepos[channel] */
ldg     r31,0,r30
nop                    /* wait for the loaded value */
add     r30,par3,r30
stg     r30,r31,0      /* increment current write/read position by length */
#endif
getsr   r31                     /* fetch the status register */
and     r31,SHADOW,r31          /* isolate the SHADOW flag */
bne     4               /* shadowed processors skip write: the target is the return below */
nop
getlo   SYSCALL_WRITE,Ret       /* system call number is passed in Ret */
sysc
return             /* no synchronization needed here, since this is asynchronous */
nop

.globl _sys_exit
_sys_exit:        /* Issues the SYSCALL_EXIT system call. No parameters. */
                  /* Uses r31 as scratch. */
getsr   r31                     /* fetch the status register */
and     r31,SHADOW,r31          /* isolate the SHADOW flag */
bne     0               /* shadowed processors stay here forever (branch to itself) */
nop
getlo   SYSCALL_EXIT,Ret        /* system call number is passed in Ret */
sysc
return                  /* executed only if the system call comes back */
nop

.globl _lseek
_lseek:           /* Issues the SYSCALL_LSEEK system call. */
                  /* par1 = filedescriptor (channel #) */
                  /* par2 = offset */
                  /* par3 = origin */
/* For the handling of _filepos in fseek/ftell see io.c */
getlo   SYSCALL_LSEEK,Ret  /* system call number is passed in Ret */
sysc
return            /* no synchronization needed here, since this is asynchronous */
nop               /* result is in Ret */




.globl _async_groupsize
.globl _groupsize
/* Both entry points share the same code. */
_async_groupsize: /* the extra async version is necessary for consistency */
_groupsize:       /* get contents of synchronization cell. No parameters. */
                  /* Result in Ret. */
bmc     0         /* force next modulo = 0 (same use of bmc as in the lock routines of this file) */
ldg     gps,1,Ret /* the synchronization cell is the word at gps+1 */
return
nop

.globl _parentgroupsize    /* get contents of parent group's synchr. cell. */
_parentgroupsize:          /* no async version necessary. -- No parameters. */
   /* Result in Ret.
    * Attention! This routine reports only the contents of the sync cell.
    * The parent group may actually have more processors, some of which
    * are already within the next synchronization routine. */
bmc     0                  /* force next modulo = 0 */
ldg     gps,0,Ret          /* pointer to parent group's shared group frame */
nop                        /* wait for the loaded pointer */
ldg     Ret,1,Ret          /* load contents of the sync cell there */
return
nop

.globl __groupsize    /* get contents of parent(k) group's synchr. cell.*/
__groupsize:          /* Parameter: par1=k indicates k'th ancestor group */
   /* Result in Ret. Uses r30 as scratch; par1 is counted down.
    * Attention! This routine reports only the contents of the sync cell.
    * The parent group may actually have more processors, some of which
    * are already within the next synchronization routine.
    * _groupsize(1) is equivalent to (but slower than) parent_groupsize()
    * NOTE(review): the loop body runs before the counter is tested, so
    * k <= 0 still appears to follow one parent link -- confirm that callers
    * always pass k >= 1. */
add     gps,0,r30     /* start from the own shared group frame */
DO_ONE_INDIRECTION:
ldg     r30,0,r30     /* pointer to parent group's shared group frame */
add     par1,-1,par1  /* one level done */
bgt     DO_ONE_INDIRECTION   /* repeat while the remaining count is > 0 */
bmc     0             /* force next modulo = 0 */
ldg     r30,1,Ret     /* mo=0. load contents of the sync cell there */
return
nop



/* ====== low-level math routines: ====================== */

.globl _ftoi
_ftoi:
 /* Callable wrapper around the ftoi instruction. */
 /* par1 contains the floatingpoint operand */
 /* in Ret we return the integer corresponding to it */
ftoi    par1,Ret
return
nop
 
.globl _itof
_itof:
 /* Callable wrapper around the itof instruction. */
 /* par1 contains the integer operand */
 /* in Ret we return the floatingpoint value corresponding to it */
itof    par1,Ret
return
nop


/* ====== implementation of mutual exclusion: =========== */

/* Many ideas for implementing mutual exclusion, locks, fair locks etc.
 * as done here were taken from the file "sbp_multiproc.h"
 * written by Jochen Roehrig,  (C) 1994 by bird@cs.uni-sb.de
 * at Saarbruecken University, LS Prof. W.J. Paul, SB-PRAM project,
 * within the framework of the p4gcc compiler libraries.
 * Documentation will be contained in Jochen's Master thesis
 * which will appear 1995 at Saarbruecken University, Germany.
 * Jochen's contribution is greatly appreciated.
 * ---- Christoph W. Kessler, in March 1995.
 */

.globl _simple_lockup                   /* using simple locks: */
_simple_lockup:
 /* Spins until this processor acquires the lock.
  * par1   contains address of a shared variable (lock) */
 /*        no return value. uses r30. */
 /* The leading number in each comment below is the modulo value at which
  * the instruction executes. */
bmc     0            /* force next modulo = 0 */
getlo   1,r30        /* 0,prepare mpmax */
mpmax   par1,0,r30   /* 1,test&set the lock */
nop                  /* 0,delay slot */
add     r30,0,R0     /* 1,compare to 0 */
bne     -3           /* 0,try again (back to the mpmax) if I'm not leader */
                     /* in that case, r30 holds still 1 */
return               /* leader returns, the others still wait */
nop

.globl _fair_lockup                   /* using fair locks: */
_fair_lockup:
 /* Ticket-style acquire: draw a number, then spin until it is served.
  * par1   contains address of a shared struct (fair lock):
  *        par1,0 denotes the next free number,
  *        par1,1 the currently working number */
 /*        no return value. uses r30 and r31. */
bms     0            /* force next modulo = 1 */
getlo   1,r30        /* 1,prepare mpadd */
mpadd   par1,0,r30   /* 0,get next number (my ticket, kept in r30) */
ldg     par1,1,r31   /* get currently working number */
nop                  /* delay */
sub     r31,r30,R0   /* compare the two counters */
bne     -3           /* if not my turn, iterate (back to the ldg) */
return               /* leader returns, the others still wait */
nop

.globl _fair_unlock                   /* using fair locks: */
_fair_unlock:
 /* Releases a fair lock by advancing the currently working number,
  * which lets the holder of the next number in _fair_lockup proceed.
  * par1   contains address of a shared struct (fair lock):
  *        par1,0 denotes the next free number,
  *        par1,1 the currently working number */
 /*        no return value. uses r30 */
bms     0            /* force next modulo = 1 */
getlo   1,r30        /* 1,prepare syncadd */
syncadd r30,par1,1   /* 0,increment currently working number */
return
nop

.globl _safe_lockup                   /* using safe locks: */
_safe_lockup:
 /* Spins until the lock is acquired, then records the owner.
  * par1   contains address of a shared struct (safe lock):
  *        par1,0 denotes the proper lock,
  *        par1,1 the process ID of the current lock owner */
 /*        no return value. uses r30 and r31. */
getlo   1,r30        /* prepare mpmax */
gethi   ___PROC_NR__,r31
add     r31,___PROC_NR__&0x1fff,r31   /* r31 = &___PROC_NR__ */
ldg     r31,0,r31    /* r31 = my process ID */
bms     0            /* NOTE(review): the original comment said "force next modulo = 0", but
                      * everywhere else in this file bms is commented as forcing modulo 1
                      * and bmc as forcing modulo 0 -- confirm which is intended here */
mpmax   par1,0,r30   /* try to catch lock */
nop                  /* delay */
add     r30,0,R0     /* compare */
bne     -3           /* retry (back to the mpmax) if lock was locked */
stg     r31,par1,1   /* store __PROC_NR__ in lock.owner */
return
nop

.globl __safe_unlock                  /* using safe locks: */
__safe_unlock:
 /* Releases a safe lock, but only if it is locked and owned by the caller.
  * par1   contains address of a shared struct (safe lock):
  *        par1,0 denotes the proper lock,
  *        par1,1 the process ID of the current lock owner */
 /* Returns in Ret: 0 = unlocked OK, 1 = lock was not locked,
  *        2 = caller is not the lock owner. uses r30 and r31 */
bmc     0            /* force next modulo = 0 */
getlo   0,r30
mpmax   par1,0,r30   /* mpmax 0 won't change state of lock */
ldg     par1,1,r31   /* load lock owner */
/* a nop delay slot at this point is commented out in the original source */
add     r30,0,R0     /* set cc */
bne     3            /* lock was locked: continue at the owner check */
/* try to unlock lock which isn't locked: We must not change the state of the
 * lock because it could have been locked by another process meanwhile */
return
getlo   1,Ret        /* return 1 (error code) */
/* else: lock was locked */
gethi   ___PROC_NR__,r30
add     r30,___PROC_NR__&0x1fff,r30   /* r30 = &___PROC_NR__ */
ldg     r30,0,r30    /* r30 = my process ID */
nop                  /* wait for the loaded value */
sub     r31,r30,R0   /* compare lock owner with my process ID */
beq     3            /* I am the owner: continue at the unlock sequence */
/* try to unlock lock that the executing process doesn't own:
 * We must not change the state of the lock */
return
getlo   2,Ret        /* return 2 (error code) */
/* else: executing process is lock owner -> unlock it */
bmc     0            /* force next modulo = 0 */
getlo   -1,r30
stg     r30,par1,1   /* new lock owner is Nobody (-1) */
getlo   0,r30
stg     r30,par1,0   /* new lock value is 0 */
return
getlo   0,Ret        /* return 0 (OK) */


.globl _count_barrier
_count_barrier:
/* _count_barrier ( c, n ) based on J.Keller's synchronization routine
 *        par1: address of semaphore { count 1, count 2 } struct
 *        par2: number of processors required to arrive in order to continue
 *        uses r30, r31. No return value.
 * Phase 1: every processor increments count1 and spins until count1 == n.
 * Phase 2: every processor increments count2; the instruction timing below
 * then brings the processors into step before both counters are cleared. */
bmc     0            /* force next modulo = 0 */
getlo   1,r30
syncadd r30,par1,0   /* increment count1 */
ldg     par1,0,r31   /* load count1 */
nop                  /* wait for the loaded value */
sub     r31,par2,R0  /* compare count1==nprocs */
bne     -3           /* iterate (back to the ldg) until equal */
syncadd r30,par1,1   /* increment count2 */
ldg     par1,1,r31   /* load count2 */
add     par1,1,r30   /* address of count2 required below*/
sub     r31,par2,R0  /* compare count2==nprocs */
beq     3            /* slow processors continue directly at the reset */
nop                  /* fast processors: 2 nops */
nop
/* now the "n" processors are synchronous. */
stgc    par1       /* reset count1 */
nop
stgc    r30        /* reset count2 */
return
nop


.globl _pravail
_pravail:
/* returns the number of free private memory words for this processor */
/* No parameters. Result in Ret, computed from the registers spp and epp. */
return
sub     spp,epp,Ret    /* instruction placed after the return computes the result */


.globl _shavail
_shavail:
/* returns the number of free shared memory words for this group */
/* No parameters. Result in Ret, computed from the registers sps and eps. */
return
sub     sps,eps,Ret    /* instruction placed after the return computes the result */


.globl _modf
_modf:
/* takes 2 parameters, the number (par1) and the pointer to the integer part (par2) */
/* returns the fractional part in Ret; uses r31 as scratch */
/* NOTE(review): the original comment claimed that the absolute value of the
 * result is stored into Ret, but no absolute-value step is visible here --
 * the sign of the returned fraction should be confirmed.
 * NOTE(review): the +1 correction is taken on the bge condition alone, so it
 * presumably also applies to negative inputs that are already integral --
 * confirm against the rounding behaviour of ftoi. */
ftoi    par1,r31      /*int part in r31*/
bge     2             /*non-negative: skip the correction. For negative numbers: */
add     r31,1,r31     /*add 1*/
itof    r31,r31       /*back to floatingpoint*/
stg     r31,par2,0    /*store integer part through the pointer*/
return               
fsub    r31,par1,Ret  /*subtract modified integer part from number*/
                      /*and leave the difference in Ret*/

.globl _floor
_floor:
/* takes 1 parameter (double) and returns floor(par1) in Ret. */
/* Implemented as a float -> int -> float round trip; par1 is overwritten.
 * NOTE(review): this is only a true floor if ftoi rounds toward minus
 * infinity -- _modf and _ceil in this file appear to assume that; confirm. */
ftoi    par1,par1
return
itof    par1,Ret      /* instruction placed after the return produces the result */

#if 0
/* Disabled alternative: convert, then subtract 1.0 when the converted value
 * exceeds the argument. Never assembled. */
ftoi    par1,r31
itof    r31,r31
fsub    r31,par1,pc  /* if par1 < r31 then subtract 1.0 */
fboge   4
getlo   __one,r30
add     r30,0x1fff&__one,r30
fsub    r30,r31,r31
return
nop
#endif


.globl _ceil
_ceil:
/* takes 1 parameter and returns ceil(par1) in Ret. */
/* Uses r30 and r31 as scratch. */
ftoi    par1,r31      /* integer conversion of the argument */
itof    r31,r30       /* ... and back to floating point */
fsub    r30,par1,pc   /* compare the converted value with the argument */
fbeq    3             /* argument was already integral: r30 is the result */
add     r31,1,r31     /* otherwise take the next integer */
itof    r31,r30
return
mov     r30,Ret       /* instruction placed after the return delivers the result */


.globl _ftox         /* fool the compiler */
_ftox:
/* Returns par1 unchanged in Ret; no conversion instruction is involved. */
return
mov    par1,Ret      /* instruction placed after the return copies the argument */


#if 0
/* Disabled: never assembled. */
abs function occurs in stdlib.c
.globl _abs
_abs:
/* takes 1 parameter and returns abs(par1) in Ret */
mov    par1,Ret      /* result for a non-negative argument */
bge    2             /* non-negative: skip the negation */
sub    par1,r0,Ret   /* negative: Ret = 0 - par1 */
return
nop
#endif


// this sqrt routine was buggy. 
// I have written a new one in math.c    CWK 990316
// .globl _sqrt
// _sqrt:
// #define pgcc_par1 par1
// #define pgcc_par2 par2
// #define pgcc_par3 par3
// #define pgcc_par4 par4
// #define pgcc_sc r31
// #define NAN 0x7fc00000
// /* #end prologue: regs=0, cfpas=0, vars=0 */
	// fadd	pgcc_par1, r0, Ret
// /* # !(nan+z+n) */
	// fbogt	LL2
	// fbeq	LL5
	// gethi	NAN, Ret
	// bra	LL5
// 
// LL2:	fbinf	LL5
	// asr	Ret, 24, pgcc_par3
	// sub	pgcc_par3, 64, pgcc_sc
	// lsl	pgcc_sc, 24, pgcc_sc
	// add	Ret, pgcc_sc, r30
	// gethi	0x3e800000, pgcc_par1
	// fmul	r30, pgcc_par1, pgcc_sc
	// gethi	0x40400000, Ret
	// fsub	pgcc_sc, Ret, pgcc_sc
	// fmul	pgcc_sc, pgcc_par1, pgcc_par4
	// gethi	0x3f000000, pgcc_par1
	// fmul	pgcc_par4, pgcc_par1, pgcc_par2
	// fmul	r30, pgcc_par4, pgcc_sc
	// fmul	pgcc_sc, pgcc_par4, pgcc_sc
	// fsub	pgcc_sc, Ret, pgcc_sc
	// fmul	pgcc_par2, pgcc_sc, pgcc_par4
	// fmul	pgcc_par4, pgcc_par1, pgcc_par2
	// fmul	r30, pgcc_par4, pgcc_sc
	// fmul	pgcc_sc, pgcc_par4, pgcc_sc
	// fsub	pgcc_sc, Ret, pgcc_sc
	// fmul	pgcc_par2, pgcc_sc, pgcc_par4
	// fmul	pgcc_par4, pgcc_par1, pgcc_par2
	// fmul	r30, pgcc_par4, pgcc_sc
	// fmul	pgcc_sc, pgcc_par4, pgcc_sc
	// fsub	pgcc_sc, Ret, pgcc_sc
	// fmul	pgcc_par2, pgcc_sc, pgcc_par4
	// fmul	pgcc_par4, pgcc_par1, pgcc_par2
	// fmul	r30, pgcc_par4, pgcc_sc
	// fmul	pgcc_sc, pgcc_par4, pgcc_sc
	// fsub	pgcc_sc, Ret, pgcc_sc
	// fmul	pgcc_par2, pgcc_sc, pgcc_par4
	// fmul	pgcc_par4, pgcc_par1, pgcc_par1
	// fmul	r30, pgcc_par4, pgcc_sc
	// fmul	pgcc_sc, pgcc_par4, pgcc_sc
	// fsub	pgcc_sc, Ret, Ret
	// fmul	pgcc_par1, Ret, pgcc_par4
	// add	pgcc_par3, 63, pgcc_par3
	// lsl	pgcc_par3, 23, pgcc_par3
	// fmul	pgcc_par4, r30, Ret
	// fmul	pgcc_par3, Ret, Ret
// LL5:
        // return
        // nop

/* sine and cosine function, async, math.h */

/* Floating point constants used by _SinLoopX. */
.section ".gsdata", .data
.globl __ahalf
__ahalf:.float 0e5.00000000e-01           /* 0.5 */
.globl __three
__three:.float 0e3.00000000e+00           /* 3.0 */
.globl __sininitfactor
__sininitfactor:.float 0e1.01610502e-04   /* 2/(3^9), the initial scaling factor */
.section ".text", .text

.globl _SinLoopX
_SinLoopX:
/* The approximation loop of the sine function in math.c
 * Operand x in par1, result in Ret, uses r30 for temporary.
 * par1 is overwritten. The loop is fully unrolled: the argument is scaled
 * by __sininitfactor, the step Ret = Ret * (3 - Ret*Ret) is applied nine
 * times, and the result is finally multiplied by __ahalf. */
gethi	 __sininitfactor,Ret 	 
add	 Ret,(__sininitfactor)&0x1fff,Ret
ldgn	 Ret,0,Ret       /* Ret = __sininitfactor */
gethi	 __three,r30 	 
add	 r30,(__three)&0x1fff,r30
ldgn	 r30,0,r30	 /* r30 = 3.0 */
fmul     Ret,par1,Ret    /* Ret = x * 2/(3^9) */

/* step 1 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 2 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 3 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 4 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 5 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 6 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 7 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 8 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */
/* step 9 of 9 */
fmul	 Ret,Ret,par1    /* \                           */
fsub	 par1,r30,par1   /* | Ret = Ret * (3 - Ret*Ret) */
fmul	 Ret,par1,Ret    /* /                           */

gethi	 __ahalf,r30 	 /* address of __ahalf, upper part */
add	 r30,(__ahalf)&0x1fff,r30  /* address of __ahalf, low 13 bits */
ldgn	 r30,0,r30	 /* r30 = 0.5 */
return
fmul     r30,Ret,Ret     /* final scaling by 0.5, placed after the return */




/* PERMANENT SHARED HEAP MANAGEMENT */

.globl init_perm_sheap
init_perm_sheap:
/* synchronous, void. */
/* Splits off half of the currently free shared memory (between sps and eps)
 * as the permanent shared heap and lowers eps accordingly.
 * Writes the globals permsheaplimit and permsheapptr. Uses Ret and r31. */
/* permheapsize = (eps - sps) / 2 */
/* (this sizing will still need tuning) */
sub     sps,eps,Ret
asr     Ret,1,Ret
/* permsheaplimit = old eps */
gethi   permsheaplimit,r31
add     r31,permsheaplimit&0x1fff,r31
stg     eps,r31,0
/* permsheapptr = new eps = old eps - permsheapsize: */
sub     Ret,eps,eps
gethi   permsheapptr,Ret
add     Ret,permsheapptr&0x1fff,Ret
stg     eps,Ret,0
return
nop


.globl _alloc  /* permanent shared malloc, taken directly from the heap */
_alloc:    /* par1 = number of memory cells to allocate */
           /* uses par2,par3 if warning is emitted; uses r30, r31; par1 is overwritten */
/* asynchronous, returns void * in Ret (0 on overflow). */
/* NOTE(review): permsheapptr has already been advanced by the mpadd when the
 * overflow is detected and is not moved back -- confirm this is intended. */
mov      par1,Ret
gethi    permsheapptr,r31
add      r31,permsheapptr&0x1fff,r31
mpadd    r31,0,Ret       /* permsheapptr is accessed only via mpadd -> no modulo handling needed */
                         /* Ret presumably receives the previous permsheapptr, i.e. the start of the block */
gethi    permsheaplimit,r30
add      r30,permsheaplimit&0x1fff,r30
ldg      r30,0,r30       /* r30 = permsheaplimit */
add      Ret,par1,par1   /* par1 = upper end of allocated block */
sub      par1,r30,pc     /*compare with permsheaplimit*/
bgt      FORKLIB_EXITSHMALLOC   /* block ends below the limit: success */

/* emit the error message: */
getlo    SYSCALL_WRITE,Ret      /* write = syscall 2 */
getlo    2,par1     /* stderr = 2 */
gethi    forklib_shmoverflow,par2
add      par2,forklib_shmoverflow&0x1fff,par2
getlo    53,par3    /*length*/
sysc                /* write the message forklib_shmoverflow */

FORKLIB_SHMALLOC_RETURNS_NULL:
return
getlo    0,Ret      /* overflow: return a null pointer */

FORKLIB_EXITSHMALLOC:
return    /* the returned pointer is the value left in Ret by the mpadd */
nop
/* end of _alloc */


.globl _syncadd
_syncadd:
/* Callable wrapper around the syncadd instruction.
 * par1 = address, par2 = operand. No return value. */
bmc     0              /*force next modulo 0*/
return
syncadd par2,par1,0    /*modulo = 1*/
                       /* without mo=1, pqueue goes wrong! */

.globl _syncmax
_syncmax:
/* Callable wrapper around the syncmax instruction.
 * par1 = address, par2 = operand. No return value. */
bmc     0              /*force next modulo 0*/
return
syncmax par2,par1,0    /*modulo = 1*/

.globl _syncor
_syncor:
/* Callable wrapper around the syncor instruction.
 * par1 = address, par2 = operand. No return value. */
bmc     0              /*force next modulo 0*/
return
syncor par2,par1,0    /*modulo = 1*/

.globl _syncand
_syncand:
/* Callable wrapper around the syncand instruction.
 * par1 = address, par2 = operand. No return value. */
bmc     0              /*force next modulo 0*/
return
syncand par2,par1,0    /*modulo = 1*/


#if 0
/* Debug/Profile routines: */
/* Disabled: nothing between this #if 0 and the matching #endif is assembled. */

.globl _myfputs

.globl __prologue
__prologue:
/* par1 holds a pointer to a structure. */
/* The third word component of that structure contains the address of the function name. */
/* Passes the name string and the word stored at __ftb to _myfputs. */
ldg   app,1,r30
gethi __ftb,r31
ldg   r30,2,r30  /* third word of the structure: address of the name */
add   r31,__ftb&0x1fff,r31
ldg   r31,0,r31  /* word stored at __ftb */
pshg  app,spp    /* save app */
add   spp,-1,app
pshg  r30,spp    /* the name string */
pshg  r31,spp
bsrg  spp,_myfputs
mov   app,spp    /* drop the pushed arguments */
ldg   spp,0,app  /* restore app */
return
nop

.globl __epilogue
__epilogue:      /* empty stub */
return
nop

.globl __caller
__caller:        /* empty stub */
return
nop

#endif

.globl __get_app
__get_app:
/* Returns the current contents of the register app in Ret. No parameters. */
return
mov app,Ret      /* instruction placed after the return copies app */

.globl _fmpadd
_fmpadd:  /*par1 = target array base y of space >= 2*p*/
          /*par2 = operand for $*/
          /*Ret: prefix sum for $*/
          /*uses par3, par4, r30, r31 as scratch registers; par1 and par2 are overwritten */
          /* Floating point prefix sum built from fadd steps with a stride
           * (r31) that doubles each round, while the stride is below the
           * value loaded from gps+1 (the sync cell read by _groupsize).
           * NOTE(review): presumably relies on all processors of the group
           * executing this in lock step -- confirm. */
   ldg      gpp,2,par3     /* par3 = $ */
   getlo    1,r31          /* initial stride = 1 */
   add      par1,par3,par1 /* par1 = &y[$] */
   ldg      gps,1,r30      /* r30 = contents of the sync cell, called p here */
   stg      par2,par1,0    /* y[$] = operand */
   add      par1,r30,par3  /* par3 = &y[$+p] */
   stgc     par3           /*y[$+p]=0*/
 FMPADDLOOP:
   ldg      par1,r31,Ret   /* Ret  = y[$+stride] */
   ldg      par1,0,par2    /* par2 = y[$] */
   add      par1,r31,par4  /* par4 = &y[$+stride] */
   fadd     Ret,par2,Ret   /* add the two loaded values */
   stg      Ret,par4,0     /* y[$+stride] = sum */
   add      r31,r31,r31    /* double the stride */
   sub      r30,r31,pc     /* compare stride with p */
   blt      FMPADDLOOP     /* continue while stride < p */
   ldg      par1,0,Ret     /* result: final y[$] */
   return
   nop
 
