/**************************************************************************
*                                                                         *
*  Author      : Dr. Thomas Brandes, GMD, SCAI.LAB                        *
*  Copyright   : GMD St. Augustin, Germany                                *
*  Date        : Sep 93                                                   *
*  Last Update : Oct 97                                                   *
*                                                                         *
*  This Module is part of the DALIB                                       *
*                                                                         *
*  Module      : mapping                                                  *
*                                                                         *
*  Function:  Handling extensions of block or cyclic distributed          *
*             dimensions                                                  *
*                                                                         *
*  P  :  denotes always the number of processors in the considered        *
*        dimensions                                                       *
*                                                                         *
*  I  :  is always a relative processor position (1 <= I <= P)            *
*        this must not be own position                                    *
*                                                                         *
*  N  :  is always the number of elements for the distributed dimensions  *
*                                                                         *
*  Export :  local to DALIB                                               *
*  ========================                                               *
*                                                                         *
*   int dalib_block_owner (int P, int lb, int ub, int k)                  *
*                                                                         *
*   int dalib_cyclic_owner (int P, int lb, int ub, int k)                 *
*                                                                         *
*   void dalib_cyclic_range (P, I, lb, ub, low, high, stride,             *
*                       *loc_low, *loc_high, *loc_stride)                 *
*                                                                         *
*  IMPORT:                                                                *
*  =======                                                                *
*                                                                         *
*    intersect.c  :  computing on sections                                *
*                                                                         *
*  UPDATE:                                                                *
*  =======                                                                *
*                                                                         *
*     09/96  : block(m) according HPF standard                            *
*     10/97  : cyclic(m) now supported                                    *
*     10/97  : indirect(map) now supported                                *
*     10/97  : arbitrary(size,len,map) now supported                      *
*                                                                         *
**************************************************************************/

# include <stdio.h>
# include "dalib.h"

# undef DEBUG

/*********************************************************************
*                                                                    *
*   void dalib_calc_block_size (int P, int lb, int ub, int *bsize)   *
*                                                                    *
*********************************************************************/

void dalib_calc_block_size (P, lb, ub, bsize)

int P;       /* number of processors of the dimension                  */
int lb, ub;  /* lower and upper bound of the distributed dimension     */
int *bsize;  /* in : wanted block size, a value <= 0 asks for default  */
             /* out: block size that is really used                    */

{ char text[120];
  int  extent;      /* number of elements in lb:ub */

  extent = ub - lb + 1;

  /* default block size: smallest one for which P blocks cover lb:ub */

  if (*bsize < 1)
     *bsize = (extent + P - 1) / P;

#ifdef DEBUG
  printf ("%d: distribute block(%d) of %d elements onto %d procs\n",
           pcb.i, *bsize, extent, P);
#endif

  /* P blocks of this size hold all elements, so the size is legal */

  if (extent <= P * (*bsize)) return;

  /* the given block size is too small for the dimension */

  sprintf (text, "BLOCK(%d) of %d elements onto %d procs\n",
                  *bsize, extent, P);
  dalib_internal_error (text);
  dalib_stop();

} /* dalib_calc_block_size */

/*********************************************************************
*                                                                    *
*  void dalib_block_size (int P, int pid, int B,                     *
*                         int global_size[2], int local_slice[2])    *
*                                                                    *
*  - dimension of global_size is BLOCK(B) distributed on P procs     *
*  - compute local slice on pid of P, 1 <= pid <= P                  *
*                                                                    *
*********************************************************************/

void dalib_block_size (P, pid, B, global_size, local_slice)

int P;               /* number of processors                          */
int pid;             /* processor whose part is wanted, 1 <= pid <= P */
int B;               /* block size, a value <= 0 asks for the default */
int global_size [];  /* in : lb, ub of the whole dimension            */
int local_slice [];  /* out: lb, ub of the part that pid owns         */

{ int width;   /* block size that is really used */

  /* without an explicit size take the ceiling of (elements / P) */

  width = (B > 0) ? B : (global_size[1] - global_size[0] + P) / P;

  /* block pid starts behind pid-1 complete blocks */

  local_slice[0] = global_size[0] + (pid - 1) * width;
  local_slice[1] = local_slice[0] + width - 1;

  /* never run over the upper bound; a block that starts behind the
     upper bound ends up as an empty slice (lower > upper)           */

  if (global_size[1] < local_slice[1]) local_slice[1] = global_size[1];

#ifdef DEBUG
  printf ("%d of %d: BLOCK(%d->%d) has size global = %d:%d, local = %d:%d\n",
          pid, P, B, width, global_size[0], global_size[1],
          local_slice[0], local_slice[1]);
#endif

} /* dalib_block_size */

/*******************************************************************
*                                                                  *
*  void dalib_block_range (int P, int pid, int bsize,              *
*                          int global_size[2], global_range[3],    *
*                          int local_range[3])                     *
*                                                                  *
*   - computes local part of global range on pid (1 <= pid <= P)   *
*                                                                  *
*******************************************************************/

void dalib_block_range (P, I, bsize, global_size, global_range, local_range)

int P, I, bsize;      /* in : processors, position I, block size      */
int global_size [];   /* in : lb, ub of the distributed dimension     */
int global_range [];  /* in : range whose local part is wanted        */
int local_range [];   /* out: result of the intersection              */

{ int owned [2];   /* slice of the dimension that belongs to I */

  /* step 1: which part of the dimension is owned by position I */

  dalib_block_size (P, I, bsize, global_size, owned);

  /* step 2: cut the global range down to this owned slice */

  dalib_intersect_range_slice (global_range, owned, local_range);

} /* dalib_block_range */

/*******************************************************************
*                                                                  *
*  void dalib_block_slice (int P, int pid, int bsize,              *
*                          int global_size[2], global_slice[3],    *
*                          int local_slice[3])                     *
*                                                                  *
*   - computes local part of global slice on pid (1 <= pid <= P)   *
*                                                                  *
*******************************************************************/

void dalib_block_slice (P, pid, bsize, global_size, global_slice, local_slice)

int P, pid, bsize;    /* in : processors, position pid, block size    */
int global_size  [];  /* in : lb, ub of the distributed dimension     */
int global_slice [];  /* in : slice whose local part is wanted        */
int local_slice  [];  /* out: result of the intersection              */

{ int owned [2];   /* slice of the dimension that belongs to pid */

  /* step 1: which part of the dimension is owned by pid */

  dalib_block_size (P, pid, bsize, global_size, owned);

  /* step 2: intersect the owned slice with the global slice */

  dalib_intersect_slices (owned, global_slice, local_slice);

} /* dalib_block_slice */

/*******************************************************************
*                                                                  *
*  int dalib_block_owner (int P, int bsize, int lb, int ub, int k) *
*                                                                  *
*   - dimension of size lb:ub is block(bsize) distributed onto     *
*     P processors                                                 *
*   - returns owner pid of k, where lb<=k<=ub, 0 <= pid < P        *
*                                                                  *
*******************************************************************/

int dalib_block_owner (P, bsize, lb, ub, k)

int P;        /* number of processors                          */
int bsize;    /* block size, a value <= 0 asks for the default */
int lb, ub;   /* bounds of the distributed dimension           */
int k;        /* index whose owner is wanted                   */

{ int width;   /* block size that is really used */
  int pos;     /* number of the block holding k  */

  /* The owner is the number of the block, counted from 0.  A former
     version used ((k-lb+1) * P - 1) / N + 1, which does not match
     the standard.                                                    */

  width = bsize;

  if (width <= 0)
     width = ((ub - lb + 1) + P - 1) / P;   /* ceiling of N / P */

  pos = (k - lb) / width;

#ifdef DEBUG
  printf ("%d: owner of %d is %d\n", pcb.i, k, pos);
#endif

  return pos;

} /* dalib_block_owner */

/**************************************************************************
*                                                                         *
*  void dalib_block_addresses (int P, int B, int lb, int ub,              *
*                              int N, int index[], int owner[])           *
*                                                                         *
*  - [lb:ub] is distributed BLOCK (B) onto P processors                   *
*  - add owner value (0 <= owner < P) of index [i] to owner[i]            *
*  - mask is hidden in owner (-1 for mask[i] is true)                     *
*                                                                         *
**************************************************************************/

void dalib_block_addresses (P, B, lb, ub, N, index, owner)

int P;         /* number of processors                               */
int B;         /* block size, a value <= 0 asks for the default      */
int lb, ub;    /* bounds of the distributed dimension                */

int index[];   /* in    : N global index values                      */
int owner[];   /* in/out: N owner values, negative entries are       */
               /*         skipped and stay unchanged                 */
int N;         /* number of entries in index and owner               */

{ int i;
  int bsize;   /* block size that is really used */

  /* B <= 0 stands for the default block size, the ceiling of
     (ub-lb+1) / P.  All divisions below have to use bsize, dividing
     by B itself would fail for B == 0.                              */

  bsize = B;
  if (bsize <= 0) bsize = (ub-lb+P)/P;

  /* owner is given by the number of the block */

#pragma vdir nooverlap (owner,index)
  for (i=0; i<N; i++) 

    if (owner[i] >= 0) 

       { int val;

         val = index[i];

         /* indexes outside of lb:ub are marked with -1 (below lb)
            or P (above ub), both are no legal owner values        */

         if (val < lb) owner[i] = -1;
          else if (val > ub) owner[i] = P;
          else owner[i] += (val - lb) / bsize; 
       }

} /* dalib_block_addresses */

/*******************************************************************
********************************************************************
******       C Y C L I C (1)                                  ******
********************************************************************
*******************************************************************/

/*******************************************************************
*                                                                  *
*  void dalib_cyclic_size (int P, int pid, int global_size[2],     *
*                          int local_size[3]                 )     *
*                                                                  *
*  - computes local part of global_size distributed CYCLIC(1)      *
*  - compute local part for pid 1 <= pid <= P                      *
*                                                                  *
*******************************************************************/

void dalib_cyclic_size (P, pid, global_size, local_size)

int P;               /* number of processors                       */
int pid;             /* processor whose part is wanted             */
int global_size[];   /* in : lb, ub of the whole dimension         */
int local_size [];   /* out: lb, ub, stride of the local part      */

{ int extent;   /* number of elements in the dimension   */
  int count;    /* number of elements that belong to pid */

  extent = global_size[1] - global_size[0] + 1;
  count  = (extent + P - pid) / P;

  if ((pid < 1) || (P < pid))

     { /* no legal position: hand back the empty range 1:0 */

       local_size[0] = 1;
       local_size[1] = 0;
     }

   else

     { /* pid owns every P-th element, starting at lb + pid - 1 */

       local_size[0] = global_size[0] + pid - 1;
       local_size[1] = local_size[0] + P * (count - 1);
     }

  local_size[2] = P;   /* stride is P in both cases */

#ifdef DEBUG
  printf ("%d of %d: CYCLIC(1) has size global = %d:%d, local = %d:%d:%d\n",
          pid, P, global_size[0], global_size[1],
          local_size[0], local_size[1], local_size[2]);
#endif

} /* dalib_cyclic_size */

/**********************************************************************
*                                                                     *
*  void dalib_cyclic_slice (int P, int pid, int global_size[2],       *
*                           int global_slice[2], int local_range[3])  *
*                                                                     *
**********************************************************************/

void dalib_cyclic_slice (P, pid, global_size, global_slice, local_range)

int P, pid;           /* in : processors, position pid               */
int global_size  [];  /* in : lb, ub of the distributed dimension    */
int global_slice [];  /* in : slice whose local part is wanted       */
int local_range  [];  /* out: result of the intersection             */

{ int owned [3];   /* lb, ub, stride of the part owned by pid */

  /* step 1: the cyclic part of pid is a range with stride */

  dalib_cyclic_size (P, pid, global_size, owned);

  /* step 2: intersect this range with the global slice */

  dalib_intersect_range_slice (owned, global_slice, local_range);

} /* dalib_cyclic_slice */

/*******************************************************************
*                                                                  *
*  void dalib_cyclic_range (P, pid, global_size,                   *
*                           global_section, local_section)         *
*                                                                  *
*  - compute local part of global section on pid, 1 <= pid <= P    *
*  - dimension of global_size is distributed CYCLIC(1) on P procs  *
*                                                                  *
*******************************************************************/

void dalib_cyclic_range (P, pid, global_size, global_section, local_section)

int P, pid;             /* in : processors, position pid              */
int global_size [];     /* in : lb, ub of the distributed dimension   */
int global_section [];  /* in : section whose local part is wanted    */
int local_section [];   /* out: result of the intersection            */

{ int owned [3];   /* lb, ub, stride of the part owned by pid */

  /* step 1: the cyclic part of pid is a range with stride */

  dalib_cyclic_size (P, pid, global_size, owned);

  /* step 2: intersect the global section with this range */

  dalib_intersect_sections (global_section, owned, local_section);

} /* dalib_cyclic_range */

/**************************************************************************
*                                                                         *
*  int dalib_cyclic_owner (int P, int lb, int ub, int k)                  *
*                                                                         *
*    - dimension of shape lb:ub is cyclic(1) distributed onto P procs     *
*    - finds owner of k, lb <= k <= ub, with 0 <= pid < P                 *
*                                                                         *
**************************************************************************/

int dalib_cyclic_owner (P, lb, ub, k)

int P;        /* number of processors                  */
int lb, ub;   /* bounds of the distributed dimension   */
int k;        /* index whose owner is wanted           */

{ int dist;   /* distance of k from the lower bound */

#ifdef DEBUG
  printf ("%d : cyclic owner for P = %d, range = %d:%d, val = %d\n",
           pcb.i, P, lb, ub, k);
#endif

  /* elements are dealt out one by one, so the owner repeats every P */

  dist = k - lb;

  return (dist % P);
}

/**************************************************************************
*                                                                         *
*  void dalib_cyclic_addresses (int P, int lb, int ub,                    *
*                               int N, int index[], int owner[])          *
*                                                                         *
*  - [lb:ub] is distributed CYCLIC(1) onto P processors                   *
*  - add owner value (0 <= owner < P) of index [i] to owner[i]            *
*  - mask is hidden in owner (-1 for mask[i] is true)                     *
*                                                                         *
**************************************************************************/

void dalib_cyclic_addresses (P, lb, ub, N, index, owner)

int P;         /* number of processors                              */
int lb, ub;    /* bounds of the dimension, only lb is used here     */
int N;         /* number of entries in index and owner              */
int index[];   /* in    : N global index values                     */
int owner[];   /* in/out: N owner values, negative ones are skipped */

{ int i;

#pragma vdir nooverlap(owner,index)
  for (i = 0; i < N; i++)

    { if (owner[i] < 0) continue;   /* entry stays as it is */

      /* add the cyclic(1) owner of index[i] */

      owner[i] += (index[i] - lb) % P;
    }

#ifdef DEBUG
  printf("%d: cyclic_addresses -> ", pcb.i);
  for (i=0; i<N; i++) printf ("(%d,%d) ", owner[i], index[i]);
  printf ("\n");
#endif

} /* dalib_cyclic_addresses */

/**************************************************************************
*                                                                         *
*  int dalib_gen_block_owner (int offsets[], int P, int k)                *
*                                                                         *
*  Input:   offsets[0..P-1], e.g.  [4, 11, 31, 61, 101]                   *
*           P  (number of entries in offsets), e.g. 5                     *
*           k  index value to be searched  0 <= k < offsets[P-1]          *
*                                                                         *
*  Output:  pid with offsets[pid-1] <= k < offsets[pid]                   *
*           0 <= pid < P                                                  *
*                                                                         *
*  Examples :  k = 10 -> 1, k = 45 -> 3, k = 88 -> 4, k = 2 -> 0          *
*                                                                         *
**************************************************************************/

int dalib_gen_block_owner (offsets, P, k)

int offsets[];   /* running sums of the block sizes, offsets[0..P-1]  */
int P;           /* number of entries in offsets                      */
int k;           /* position to look for, counted from 0              */

/* Returns pid (0 <= pid < P) with offsets[pid-1] <= k < offsets[pid],
   where the lower limit for pid == 0 is 0.  Returns -1 if k is out of
   range or if there is no entry at all (P <= 0).                      */

{ int pid1, pid2, p;

#ifdef DEBUG
  printf ("dalib_gen_block_owner, P = %d, k = %d, offsets = ", P, k);
  for (p=0; p<P; p++) printf ("%d ", offsets[p]);
  printf ("\n");
#endif

  if (P <= 0) return (-1);              /* offsets[P-1] does not exist */
  if (k < 0) return (-1);               /* out of range */
  if (k >= offsets [P-1]) return (-1);  /* out of range */

  /* binary search, the owner is always within pid1 .. pid2 */

  pid1 = 0;
  pid2 = P-1;

  /* P = 1, pid1 = 0, pid2 = 0 -> ready                                  */
  /* P = 2, pid1 = 0, pid2 = 1 -> p = 1, k < offsets[0], k >= offsets[1] */

  while (pid1 < pid2)

    { /* rounding up keeps p >= 1, so offsets[p-1] is always legal */

      p = (pid1 + pid2 + 1) / 2;

      if (k < offsets[p-1]) pid2 = p-1;
       else if (k >= offsets[p])  pid1 = p+1;
       else { /* found : */ pid1 = p; pid2 = p; }
    }

  return (pid1);

} /* dalib_gen_block_owner */

/**************************************************************************
*                                                                         *
*  void dalib_gen_block_addresses (int P, int lb, int ub, int offsets[],  *
*                                  int N, int index[], int owner[])       *
*                                                                         *
*  - [lb:ub] is distributed GEN_BLOCK (offsets) onto P processors         *
*  - add owner value (0 <= owner < P) of index [i] to owner[i]            *
*  - mask is hidden in owner (-1 for mask[i] is true)                     *
*                                                                         *
**************************************************************************/

void dalib_gen_block_addresses (P, lb, ub, offsets, N, index, owner)

int P;           /* number of processors                              */
int lb, ub;      /* bounds of the distributed dimension               */
int offsets[];   /* running sums of the block sizes                   */

int index[];     /* in    : N global index values                     */
int owner[];     /* in/out: N owner values, negative ones are skipped */
int N;           /* number of entries in index and owner              */

{ int i;

  for (i = 0; i < N; i++)

    { int val;

      if (owner[i] < 0) continue;   /* entry stays as it is */

      val = index[i];

      /* indexes outside of lb:ub are marked with -1 or P, all
         others get the owner of their position relative to lb  */

      if (val < lb)
         owner[i] = -1;
       else if (ub < val)
         owner[i] = P;
       else
         owner[i] += dalib_gen_block_owner (offsets, P, val - lb);
    }

#ifdef DEBUG
  printf("%d: genblock_addresses -> ", pcb.i);
  for (i=0; i<N; i++) printf ("(%d,%d) ", owner[i], index[i]);
  printf ("\n");
#endif

} /* dalib_gen_block_addresses */

/**************************************************************************
*                                                                         *
*  int dalib_offset_owner (int offsets[], int P, int k)                   *
*                                                                         *
*  Input:   offsets[0..P-1], e.g.  [0, 50, 70, 10]                        *
*           P  (number of entries in offsets), e.g. 4                     *
*           k  index value to be searched                                 *
*                                                                         *
*  Output:  pid with maximal offsets[pid] <= k                            *
*           0 <= pid < P                                                  *
*                                                                         *
*  Examples :  k = 11 -> 3, k = 45 -> 3, k = 88 -> 2                      *
*                                                                         *
**************************************************************************/

int dalib_offset_owner (offsets, P, k)

int offsets[];   /* start offsets of the processors, any order        */
int P;           /* number of entries in offsets                      */
int k;           /* index value to look for                           */

/* Returns the pid (0 <= pid < P) whose offset is the largest one that
   is not greater than k.  If several entries share this offset the
   first one wins.  Returns -1 if every offset is greater than k.      */

{ int pid, p_max;
  int val;

#ifdef DEBUG
  printf ("dalib_offset_owner, P = %d, k = %d, offsets = ", P, k);
  for (pid=0; pid<P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

  p_max = -1;    /* nothing found so far */

  for (pid = 0; pid < P; pid++)

    { val = offsets[pid];

      if (val <= k)

         { /* the first candidate is always taken, p_max is -1 then and
              must not be used as index; later ones need a bigger offset */

           if (p_max < 0) 
               p_max = pid;
            else if (offsets[p_max] < val)
               p_max = pid;
         }
    }

  return (p_max);

} /* dalib_offset_owner */

/*******************************************************************
*                                                                  *
*  void dalib_gen_block_range (int offsets[P], int P, int pid,     *
*                              int global_size[2],                 *
*                              int global_section[3],              *
*                              int local_section[3])               *
*                                                                  *
*   - returns local part on processor pid of P, 1 <= pid <= P      *
*   - dimension is gen_block distributed, offsets available        *
*                                                                  *
*******************************************************************/

void dalib_gen_block_range (offsets, P, pid, global_size,
                            global_section, local_section)
 
int offsets[];          /* in : running sums of the block sizes       */
int P, pid;             /* in : processors, position pid              */
int global_size[];      /* in : lb, ub of the distributed dimension   */
int global_section[];   /* in : section whose local part is wanted    */
int local_section[];    /* out: result of the intersection            */
 
{ int owned [2];   /* lb, ub of the part owned by pid */
  int base;        /* lower bound of the whole dimension */

  base = global_size[0];

  /* the block of pid starts where the block before it ends, the
     first block starts at the lower bound itself                 */

  owned[0] = (pid > 1) ? base + offsets[pid-2] : base;
  owned[1] = base + offsets[pid-1] - 1;

  dalib_intersect_range_slice (global_section, owned, local_section);
 
} /* dalib_gen_block_range */
 
/*******************************************************************
*                                                                  *
*  void dalib_block_cyclic_offsets (int P, int N, int Bsize,       *
*                                   int offsets[])                 *
*                                                                  *
*  Example:  P = 3, N = 23, Bsize = 5                              *
*                                                                  *
*            sizes   :     10   8   5                              *
*            offsets :     10  18  23                              *
*                                                                  *
*******************************************************************/

void dalib_block_cyclic_offsets (P, N, Bsize, offsets)

int P;           /* number of processors                              */
int Bsize;       /* size of one chunk, used as divisor                */
int N;           /* number of elements of the dimension               */
int offsets[];   /* out: offsets[0..P-1], running sum of local sizes  */

{ int pid;
  int i, no_chunks;
  int chunk_size;

  no_chunks = (N + Bsize -1) / Bsize;

#ifdef DEBUG
  printf ("%d: offsets for CYCLIC(%d), P=%d, chunks=%d\n",
           pcb.i, Bsize, P, no_chunks);
#endif

  /* N = 23, P = 3, Bsize = 5 => no_chunks = 5 , sizes = 10 8 5 */

  for (pid=0; pid<P; pid++) offsets[pid] = 0;  /* sizes = 0 */

  /* deal out the chunks round robin, pid counts from 1 to P */

  chunk_size = Bsize; pid = 1;

  for (i=1; i<=no_chunks; i++)

    { /* last chunk can have smaller Bsize */

      if (i == no_chunks)  chunk_size = N - ((i-1)*Bsize);
 
      offsets[pid-1] += chunk_size;

      pid++; if (pid > P) pid = 1;
    }

  /* offsets contains sizes, so we build now the running sum */

  for (pid=1; pid<P; pid++) offsets[pid] += offsets[pid-1];

  /* the last running sum must be the total number of elements */

  if (offsets[P-1] != N)

     { dalib_internal_error ("block_cyclic_offsets : wrong calculation");
       printf ("%d: N = %d, offsets[%d] = %d\n", 
               pcb.i, N, P-1, offsets[P-1]);
       dalib_stop ();
     }

#ifdef DEBUG
  /* only offsets[0..P-1] have been set in this routine */
  printf ("%d: offsets (processor) for CYCLIC(%d) = ", pcb.i, Bsize);
  for (pid=0; pid<P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

} /* dalib_block_cyclic_offsets */

/*******************************************************************
*                                                                  *
*  val1  =  (chunk_local * P + p)  * size + chunk_offset           *
*                                                                  *
*  -> pid = p+ 1 , offset = chunk_local * size + chunk_offset      *
*                                                                  *
*******************************************************************/

void dalib_block_cyclic_local (P, size, lb, ub, g_index, offsets, l_index)

int P;           /* number of processors                              */
int size;        /* size of one chunk                                 */
int lb, ub;      /* bounds of the dimension, ub only for tracing      */
int g_index;     /* global index that is translated                   */
int offsets[];   /* running sums of the local sizes                   */
int *l_index;    /* out: the translated index                         */

{ int rel;         /* g_index relative to lb                     */
  int chunk;       /* number of the chunk that holds g_index     */
  int within;      /* position inside of this chunk              */
  int round;       /* number of the chunk on its own processor   */
  int proc;        /* processor of the chunk, 0 <= proc < P      */
  int p_offset;    /* position within the part of proc           */
  int base;        /* where the part of proc starts              */

  rel = g_index - lb;

  /* rel = chunk * size + within */

  chunk  = rel / size;
  within = rel % size;

  /* chunk = round * P + proc */

  round = chunk / P;
  proc  = chunk % P;

  p_offset = round * size + within;

  /* the part of processor 0 starts at lb, all others start behind
     the running sum of their predecessor                          */

  base = (proc == 0) ? lb : lb + offsets[proc-1];

  *l_index = base + p_offset;

#ifdef DEBUG
  printf ("%d: cylic(%d), P=%d, I=%d of %d:%d => (%d,%d) => (%d,%d) => %d\n",
           pcb.i, size, P, g_index, lb, ub, 
           chunk, within, proc+1, p_offset, *l_index);
#endif

} /* dalib_block_cyclic_local */

/**************************************************************************
*                                                                         *
*  void dalib_block_cyclic_global (int P, int B, int lb, int ub,          *
*                                  int l_index, int offsets[P],           *
*                                  int *g_index)                          *
*                                                                         *
*  - transform local index l_index to g_index                             *
*  - dimension lb:ub is distributed CYCLIC(B) onto P processors           *
*  - offsets is running sum on sizes of the different processors          *
*                                                                         *
**************************************************************************/

void dalib_block_cyclic_global (P, B, lb, ub, l_index, offsets, g_index)

int P;           /* number of processors                              */
int B;           /* size of one chunk                                 */
int lb, ub;      /* bounds of the dimension, ub only for tracing      */
int l_index;     /* index that is translated back                     */
int offsets[];   /* running sums of the local sizes                   */
int *g_index;    /* out: the global index                             */

{ int proc;        /* processor that holds l_index               */
  int p_offset;    /* position within the part of proc           */
  int round;       /* number of the chunk on proc                */
  int within;      /* position inside of this chunk              */
  int chunk;       /* number of the chunk in the whole dimension */

  /* the parts of the processors follow each other like gen blocks */

  proc = dalib_gen_block_owner (offsets, P, l_index-lb);

  /* take away everything that belongs to the processors before proc */

  p_offset = l_index - lb;
  if (proc != 0) p_offset -= offsets[proc-1];

  /* p_offset = round * B + within */

  round  = p_offset / B;
  within = p_offset % B;

  /* the chunks of one round are dealt out to processors 0 .. P-1 */

  chunk = round * P + proc;

  *g_index = lb + chunk * B + within;

#ifdef DEBUG
  printf ("%d: cylic(%d), P=%d, I=%d of %d:%d <= (%d,%d) <= (%d,%d) <= %d\n",
           pcb.i, B, P, *g_index, lb, ub,
           chunk, within, proc, p_offset, l_index);
#endif

} /* dalib_block_cyclic_global */

/**************************************************************************
*                                                                         *
*  int dalib_block_cyclic_owner (int P, int B, int lb, int ub, int val)   *
*                                                                         *
*   - returns pid, owner of val  (0 <= pid < P), -1 for error             *
*   - [lb:ub] is distributed CYCLIC(B) onto P processors                  *
*                                                                         *
**************************************************************************/

int dalib_block_cyclic_owner (P, B, lb, ub, val)

int P;        /* number of processors                           */
int B;        /* size of one chunk                              */
int lb, ub;   /* bounds of the dimension, ub only for tracing   */
int val;      /* index whose owner is wanted                    */

{ int owner_pid;

  /* number of the chunk that holds val, chunks go round the processors */

  owner_pid = ((val - lb) / B) % P;

#ifdef DEBUG
  printf ("%d: cyclic (%d) onto %d procs, %d of %d:%d -> pid = %d\n",
           pcb.i, B, P, val, lb, ub, owner_pid);
#endif

  return owner_pid;

} /* dalib_block_cyclic_owner */

/**************************************************************************
*                                                                         *
*  void dalib_block_cyclic_addresses (int P, int B, int lb, int ub, ...)  *
*                                                                         *
*  - [lb:ub] is distributed CYCLIC(B) onto P processors                   *
*  - offsets [pid] is local offset on processor pid                       *
*                                                                         *
*  ATTENTION: side effects on index (global -> local)                     *
*                                                                         *
**************************************************************************/

void dalib_block_cyclic_addresses (P, B, lb, ub, offsets, N, index, owner)

int P;           /* number of processors                              */
int B;           /* size of one chunk                                 */
int lb, ub;      /* bounds of the dimension, only lb is used here     */
int offsets[];   /* running sums of the local sizes                   */

int index[];     /* in : N global indexes, out: the translated ones   */
int owner[];     /* in/out: owner of index[i] is added to owner[i]    */
int N;           /* number of entries in index and owner              */

{ int i;

#pragma vdir nooverlap (index,offsets,owner)
  for (i = 0; i < N; i++)

    { int rel;         /* index[i] relative to lb                  */
      int chunk;       /* number of the chunk in the dimension     */
      int within;      /* position inside of the chunk             */
      int round;       /* number of the chunk on its processor     */
      int proc;        /* owner of index[i], 0 <= proc < P         */
      int local_val;   /* position within the part of proc         */

      rel       = index[i] - lb;

      chunk     = rel / B;
      within    = rel % B;

      round     = chunk / P;
      proc      = chunk % P;

      local_val = round * B + within;

      owner[i] += proc;

      /* part of processor 0 starts at lb, the others start behind
         the running sum of their predecessor                      */

      index[i]  = (proc == 0) ? lb + local_val
                              : lb + offsets[proc-1] + local_val;
    }

#ifdef DEBUG
  printf("%d: block_cyclic_addresses -> ", pcb.i);
  for (i=0; i<N; i++) printf ("(%d,%d) ", owner[i], index[i]);
  printf ("\n");
#endif

} /* dalib_block_cyclic_addresses */

/**************************************************************************
*                                                                         *
*   dalib_gen_block_offsets (int P, int N, int sizes[], int offsets[])    *
*                                                                         *
*   - calculate offsets given by the sizes of the dimension               *
*                                                                         *
*                      1     2      3      4      5     P-1   P           *
*                                                                         *
*   sizes              5     6     7      8      9     10     11          *
*   axis_offsets       5    11     18     26     35     45    56          *
*                                                                         *
**************************************************************************/

void dalib_gen_block_offsets (P, N, sizes, offsets)

int P;             /* number of processors                           */
int N;             /* number of total elements in dimensions         */
int sizes[];       /* in : size of the block of every processor      */
int offsets [];    /* out: running sums of the sizes                 */

{ int pid;
  int total;       /* sum of all block sizes */

  /* running sums: every offset is its predecessor plus the own size */

  offsets[0] = sizes[0];

  pid = 1;
  while (pid < P)
    { offsets[pid] = offsets[pid-1] + sizes[pid];
      pid++;
    }

  total = offsets[P-1];

  if (total < N)

    { /* blocks are too small for the dimension, this is fatal */

      dalib_internal_error ("block sizes do not sum up to total size");
      printf ("dim size is     : %d\n", N);
      printf ("sum of blocks   : %d\n", total);
      dalib_stop ();
    }

   else if (N < total)

    { /* blocks are too big, this is no error: cut all offsets at N */

      for (pid = 0; pid < P; pid++)
        { if (N < offsets[pid]) offsets[pid] = N;
        }
    }

#ifdef DEBUG
  printf ("%d: offsets (processor) for GENBLOCK = ", pcb.i);
  for (pid=0; pid<P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

} /* dalib_gen_block_offsets */

/**************************************************************************
*                                                                         *
*  HELP ROUTINES for calculating offsets                                  *
*                                                                         *
*  void dalib_reset_offsets (int offsets[], int P)                        *
*                                                                         *
*   - recalculate the offsets, e.g.  4 8 16 16 -> 0 4 8 16                *
*                                                                         *
*  void dalib_build_offsets (int offsets[], int P)                        *
*                                                                         *
*   - build offsets by sizes, e.g. 0  5  3  8  4 ->  0  5  8  16  20      *
*                                                                         *
**************************************************************************/

static void dalib_reset_offsets (offsets, P)

/* Shifts the first P entries of offsets one position to the right and
   puts 0 in front, e.g. (P = 3)   4 8 16 16  ->  0 4 8 16.

   The old value of offsets[P-1] is dropped, offsets[P] is not touched.  */

int offsets[];   /* has at least P + 1 entries */
int P;

{ int p;

  /* walk backwards so that every value is moved before it is overwritten */

  for (p = P-1; p > 0; p--) offsets[p] = offsets[p-1];

  if (P > 0) offsets[0] = 0;

} /* dalib_reset_offsets */

static void dalib_build_offsets (offsets, P)

/* Replaces the sizes in offsets[1..P] by their running sums; offsets[0]
   is always set to 0, e.g. (P = 4)   0 5 3 8 4  ->  0 5 8 16 20.        */

int offsets[];   /* P + 1 entries: in sizes (1..P), out running sums */
int P;

{ int p;
  int sum;

  offsets[0] = 0;

  sum = 0;     /* sum of the sizes seen so far */

  for (p = 1; p <= P; p++)
    { sum += offsets[p];
      offsets[p] = sum;
    }

} /* dalib_build_offsets */

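/**************************************************************************
*                                                                         *
*  void dalib_mapping_offsets (int P, int mapping[], int lb, int ub,      *
*                              int offsets[], int perm[],                 *
*                              int inv_perm[])                            *
*                                                                         *
*  - mapping[i] (1 <= mapping[i] <= P) is the processor of element lb+i   *
*  - offsets[p] gets the number of elements mapped to processors 1..p     *
*  - perm maps global to local positions, inv_perm is its inverse         *
*                                                                         *
**************************************************************************/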

void dalib_mapping_offsets (P, mapping, lb, ub, offsets, perm, inv_perm)

/* Builds the tables for a dimension [lb:ub] whose elements are mapped
   individually (MAP array, indirect addressing) onto P processors.

   mapping[i] is the processor (1 <= mapping[i] <= P) of element lb+i.

   Results (all positions are 0-based, i.e. relative to lb):

    offsets[p]   : number of elements mapped to the processors 1..p,
                   so offsets[0] = 0 and offsets[P] = N
    perm[i]      : position of global element i after the elements have
                   been reordered by processor (order within one
                   processor is kept)
    inv_perm[k]  : global element that is found at reordered position k

   An entry of mapping outside of 1..P is an internal error; it is always
   checked as such a value would be used as index into offsets.          */

/* Input arguments : */

int P;
int mapping [];     /* should have at least N (ub - lb + 1) entries */
int lb, ub;         /* range that is mapped                         */

/* Output arguments : */

int offsets [];     /* should have at least P+1 entries             */
int perm[];         /* should have N entries                        */
int inv_perm[];     /* should have N entries                        */

{ int N;   /* number of elements in the mapped dimension */
  int pid;
  int i;

  N = ub - lb + 1;

  /* calculate number of elements of every processor at first,
     offsets[pid] counts the elements of processor pid (1 <= pid <= P) */

  for (pid=0; pid<=P; pid++) offsets[pid] = 0;

  for (i=0; i<N; i++)

     { pid = mapping[i];

       if ((pid < 1) || (pid > P))

          { dalib_internal_error ("illegal MAP array for indirect addressing");
            printf ("illegal MAP array, %d is not between 1 and P = %d\n",
                     pid, P);
            dalib_stop ();
            return;   /* only reached if dalib_stop returns: never use
                         an illegal value as index                      */
          }

       offsets[pid]++;
     }

  /* now build the running sums for the offsets, must sum up to N */

  dalib_build_offsets (offsets, P);

  if (offsets[P] != N)

    { dalib_internal_error ("illegal MAP array for indirect addressing");
      dalib_stop ();
    }

  /* now build the permutation vector to map global to local index */

  for (i=0; i<N; i++)

      /* global element i will be placed to offsets[mapping[i]-1],
         that entry is used as fill pointer of processor mapping[i] */

      perm[i] = offsets[mapping[i]-1]++;

#ifdef DEBUG
  for (i=0; i<N; i++)
      printf ("element %d is mapped pos %d\n", i+lb, perm[i]+lb);
#endif

  /* every fill pointer has now reached the start of the next processor,
     so recalculate the offsets, e.g.  4 8 16 16 -> 0 4 8 16            */

  dalib_reset_offsets (offsets, P);

  /* now build the inverse permutation to map local to global index */

  for (i=0; i<N; i++) inv_perm[perm[i]] = i;

#ifdef DEBUG
  for (i=0; i<N; i++)
     printf ("local element %d was global at %d\n", i+lb, inv_perm[i]+lb);
#endif

#ifdef DEBUG
  printf ("%d: offsets (processor) for INDIRECT = ", pcb.i);
  for (pid=0; pid<=P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

} /* dalib_mapping_offsets */

/**************************************************************************
*                                                                         *
*  void dalib_indirect_addresses (int P, int lb, int ub,                  *
*                                 int offsets[], int perm[],              *
*                                 int N, int index[], int owner[])        *
*                                                                         *
*  - [lb:ub] is distributed INDIRECT (map) onto P processors              *
*  - perm[i-lb] + lb gives new position after reordering                  *
*  - add owner value (0 <= owner < P) of index [i] to owner[i]            *
*  - mask is hidden in owner (-1 for mask[i] is true)                     *
*  - ATTENTION : new local index values available in index                *
*                                                                         *
**************************************************************************/

void dalib_indirect_addresses (P, lb, ub, offsets, perm, N, index, owner)

/* Translates N global index values of an INDIRECT distributed dimension
   [lb:ub]: every index[k] is replaced by its position after reordering
   (perm[index[k]-lb] + lb), and the owner of that position, as delivered
   by dalib_gen_block_owner, is added to owner[k].

   Entries with owner[k] < 0 are masked and are left untouched.          */

int P;            /* number of processors                            */
int lb, ub;       /* range of the distributed dimension (ub not used) */
int offsets[];    /* processor offsets, handed to dalib_gen_block_owner */
int perm[];       /* perm[i-lb] + lb is new position of global index i */

int index[];      /* in : global index values, out : reordered values */
int owner[];      /* owner of new position is added, < 0 : masked     */
int N;            /* number of entries in index and owner             */

{ int k;

  for (k=0; k<N; k++)

     { int moved;    /* position of index[k] after reordering */
       int pid;      /* processor owning the new position     */

       if (owner[k] < 0) continue;    /* masked, nothing to do */

       moved = perm[index[k]-lb] + lb;
       pid   = dalib_gen_block_owner (offsets, P, moved);

       index[k]  = moved;
       owner[k] += pid;
     }

#ifdef DEBUG
  printf("%d: indirect_addresses -> ", pcb.i);
  for (k=0; k<N; k++) printf ("(%d,%d) ", owner[k], index[k]);
  printf ("\n");
#endif

} /* dalib_indirect_addresses */

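/**************************************************************************
*                                                                         *
*  void dalib_arbitrary_offsets (int P, int no_chunks,                    *
*                                int chunk_sizes[], int chunk_map[],      *
*                                int lb, int ub, int offsets[],           *
*                                int chunk_g_offsets[],                   *
*                                int chunk_l_offsets[])                   *
*                                                                         *
*  - [lb:ub] is distributed ARBITRARY (chunks) onto P processors          *
*  - chunk i has chunk_sizes[i] elements and is owned by chunk_map[i]     *
*                                                                         *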
*  Example:                                                               *
*                                                                         *
*   len (map)    :   10 (1)  20 (2)  30 (2)  40 (1)                       *
*                                                                         *
*   P_sizes      :    50    50                                            *
*   P_offsets    :     0    50    100                                     *
*                                                                         *
*   global_offsets :    0      10      30      60     100                 *
*   local_offsets  :    0      50      70      10                         *
*                                                                         *
*   permutation    :    1       4       2       3                         *
*                                                                         *
*                      10      40      20      30                         *
*                       0      10      50      70                         *
*                                                                         *
**************************************************************************/

void dalib_arbitrary_offsets (P, no_chunks, chunk_sizes, chunk_map, lb, ub, 
                              offsets, chunk_g_offsets, chunk_l_offsets)

/* Computes the offset tables for a dimension [lb:ub] that is split into
   no_chunks consecutive chunks; chunk i (counted from 0) has
   chunk_sizes[i] elements and belongs to processor chunk_map[i]
   (1 <= chunk_map[i] <= P).

   Results (all values are 0-based, i.e. relative to lb):

    offsets[p]          : number of elements on the processors 1..p,
                          so offsets[0] = 0 and offsets[P] = N
    chunk_g_offsets[i]  : first global position of chunk i,
                          chunk_g_offsets[no_chunks] = N
    chunk_l_offsets[i]  : first position of chunk i when the chunks are
                          reordered by their owning processor

   Chunk sizes that exceed the dimension are cut (in all tables) so that
   they sum up exactly to N.  An illegal value in chunk_map or sizes that
   sum up to less than N are internal errors.                            */

/* Input arguments : */

int P;
int no_chunks;
int chunk_sizes[];   /* should have at least no_chunks entries */
int chunk_map[];     /* should have at least no_chunks entries */
int lb, ub;          /* range that is mapped                   */

/* Output arguments : */

int offsets[];           /* should have at least P+1 entries          */
int chunk_g_offsets[];   /* should have at least no_chunks+1 entries  */
int chunk_l_offsets[];   /* should have at least no_chunks entries    */

{ int N;   /* number of elements in the mapped dimension */

  int pid;  
  int i;
  int size;
  int total;

  N = (ub - lb + 1);     /* number of elements in mapped dimension */

  /* initialize sizes to 0 */

  for (pid=0; pid<=P; pid++) offsets [pid] = 0;

  /* compute local sizes on every processor, verify map array */

  total = 0;

  for (i=0; i<no_chunks; i++)

     { pid  = chunk_map[i];
       size = chunk_sizes[i];

       if ((pid < 1) || (pid > P))

          { dalib_internal_error ("illegal value in map array for ARBITRARY");
            printf ("map = %d, but only 1 - %d is possible\n", pid, P);
            dalib_stop ();
            return;   /* only reached if dalib_stop returns: never use
                         an illegal value as index                      */
          }

       /* do not care about too large sizes in the size array, cut them;
          N - total cannot overflow as total never exceeds N            */

       if (size > N - total) size = N - total;

       offsets [pid] += size;
       total         += size;

     }

  /* but take care that size array sums really up to N */

  if (total < N)

    { dalib_internal_error ("size array for ARBITRARY sums not up to dimsize");

      /* chunk_g_offsets is not computed yet, total is the sum of sizes */

      printf ("dimsize = %d (%d:%d), but sum = %d\n", 
               N, lb, ub, total);
      dalib_stop ();
    }

  /* compute offsets along the processors */

  dalib_build_offsets (offsets, P);

#ifdef DEBUG
  printf ("%d: offsets for ARBITRARY = ", pcb.i);
  for (pid=0; pid<=P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

  /* compute global and local chunk offsets, offsets[pid-1] is used as
     fill pointer for the chunks of processor pid                      */

  chunk_g_offsets[0] = 0;

  for (i=0; i<no_chunks; i++)

     { pid  = chunk_map [i];
       size = chunk_sizes [i];

       /* cut too large sizes exactly as in the first pass, otherwise the
          fill pointers run beyond the sizes counted for the processors
          (chunk_g_offsets[i] is the sum of all sizes used so far)       */

       if (size > N - chunk_g_offsets[i]) size = N - chunk_g_offsets[i];

       chunk_g_offsets[i+1] = chunk_g_offsets [i] + size;
       chunk_l_offsets[i] = offsets[pid-1];
       offsets[pid-1] += size;
#ifdef DEBUG
  printf ("%d: ARBITRARY chunk %d (map=%d), size = %d, offset = %d / %d\n",
          pcb.i, i+1, pid, size, chunk_g_offsets[i], chunk_l_offsets[i]);
#endif
     }

  /* every fill pointer has reached the start of the next processor,
     so compute offsets again as information is lost                 */

  dalib_reset_offsets (offsets, P);

#ifdef DEBUG
  printf ("%d: offsets (global chunks) for ARBITRARY = ", pcb.i);
  for (i=0; i<=no_chunks; i++) printf ("%d ", chunk_g_offsets[i]);
  printf ("\n");
  printf ("%d: offsets (local  chunks) for ARBITRARY = ", pcb.i);
  for (i=0; i<no_chunks; i++) printf ("%d ", chunk_l_offsets[i]);
  printf ("\n");
  printf ("%d: offsets (processor) for ARBITRARY = ", pcb.i);
  for (pid=0; pid<=P; pid++) printf ("%d ", offsets[pid]);
  printf ("\n");
#endif

} /* dalib_arbitrary_offsets */

/**************************************************************************
*                                                                         *
*  void dalib_arbitrary_addresses (int P, int lb, int ub,                 *
*                                  int no_chunks, int global_offsets[],   *
*                                  int local_offsets[], int P_offsets[],  *
*                                  int N, int index[], int owner[])       *
*                                                                         *
*  - [lb:ub] is distributed ARBITRARY (chunks) onto P processors          *
*  - add owner value (0 <= owner < P) of index [i] to owner[i]            *
*  - mask is hidden in owner (-1 for mask[i] is true)                     *
*  - ATTENTION : new local index values available in index                *
*                                                                         *
*  - P_offsets is running sum of sizes, e.g.  4  15  21  39               *
*                                                                         *
**************************************************************************/

void dalib_arbitrary_addresses (P, lb, ub, no_chunks, global_offsets,
                                local_offsets, P_offsets, N, index, owner)

/* Translates N global index values of a dimension [lb:ub] that is
   distributed in chunks: every index[k] is replaced by its position in
   the reordered (local) numbering, and the owner of that position, as
   delivered by dalib_gen_block_owner, is added to owner[k].

   Entries with owner[k] < 0 are masked and are left untouched.          */

int P;                  /* number of processors                          */
int lb, ub;             /* range of the dimension (ub is not used here)  */
int no_chunks;          /* number of chunks                              */
int global_offsets[];   /* global start of every chunk, [0..no_chunks]   */
int local_offsets[];    /* reordered start of every chunk                */
int P_offsets[];        /* processor offsets for dalib_gen_block_owner   */

int index[];            /* in : global values, out : reordered values    */
int owner[];            /* owner is added, < 0 : masked                  */
int N;                  /* number of entries in index and owner          */

{ int k;

  for (k=0; k<N; k++)

     { int chunk;   /* chunk containing the index, used as subscript of
                       global_offsets / local_offsets; presumably
                       0 <= chunk < no_chunks, depends on the result of
                       dalib_gen_block_owner - TODO confirm              */
       int pos;     /* 0-based position, at first global, then reordered */
       int pid;     /* result of dalib_gen_block_owner for new position  */

       if (owner[k] < 0) continue;    /* masked, nothing to do */

       /* step 1:  find the chunk, global_offsets is passed without its
                   leading entry, so only the chunk ends are searched    */

       pos   = index[k] - lb;          /* calculate in 0..ub-lb */
       chunk = dalib_gen_block_owner (global_offsets+1, no_chunks, pos);

       /* step 2:  keep the offset within the chunk, but count it from
                   the reordered start of the chunk                      */

       pos = local_offsets[chunk] + (pos - global_offsets[chunk]);

       /* step 3:  owner of the reordered position, return results */

       pid = dalib_gen_block_owner (P_offsets, P, pos);

       index[k]  = pos + lb;
       owner[k] += pid;

     }  /* for all index values */

#ifdef DEBUG
  printf("%d: arbitrary_addresses -> ", pcb.i);
  for (k=0; k<N; k++) printf ("(%d,%d) ", owner[k], index[k]);
  printf ("\n");
#endif

} /* dalib_arbitrary_addresses */
