/**************************************************************************
*                                                                         *
*  Author      : Frederic Desprez, LIP, ENS, Lyon, France                 *
*                Thomas Brandes, GMD, SCAI.LAB, Germany                   *
*                Julien Zory, LIP, ENS, Lyon, France                      *
*                                                                         *
*  Copyright   : GMD St. Augustin, Germany + ENS, Lyon, France            *
*                                                                         *
*  Date        : Jan 95                                                   *
*  Last Update : Feb 96                                                   *
*                                                                         *
*  This Module is part of the DALIB                                       *
*                                                                         *
*  Module      : loccs.c                                                  *
*                                                                         *
*  Function: Low Overhead Communication and Computation Subroutines       *
*                                                                         *
*  Export :  FORTRAN Interface                                            *
*  ===========================                                            *
*                                                                         *
*   FUNCTION(dalib_loccs_anti) (section, overlap)                         *
*                                                                         *
*   FUNCTION(dalib_loccs_driver) (local_routine, block, n, m,             *
*                                 a1, d1, ...., a<n>, d<n>,               *
*                                 b1, b2, ..., b<m>)                      *
*                                                                         *
*  To be done:                                                            *
*                                                                         *
*    - arguments d<i> must now be replicated, contiguous                  *
*    - automatic determination of BLOCK_SIZE                              *
*    - loccs subroutines for other communication patterns                 *
*                                                                         *
*  Updates:                                                               *
*                                                                         *
*    02/96 : multidimensional sections, many communication arguments      *
*    06/96 : multidimensional pipelining, own routine for choosing        *
*            dimensions for pipelining (with Julien Zory, LIP, Lyon)      *
*                                                                         *
**************************************************************************/

#include "dalib.h"

/* DEBUG is switched off here, so all "#ifdef DEBUG" trace output below
   is compiled out. */
#undef DEBUG

/* BLOCK_SIZE is not referenced in the visible part of this file; the
   block size actually used is the global size_block, which the driver
   sets from its "block" argument. */
#define BLOCK_SIZE  100 
/* MAX_SECTIONS  : number of entries of comm_sections[]
   MAX_ARGUMENTS : number of entries of argument_ids[] / argument_data[] */
#define MAX_SECTIONS  2
#define MAX_ARGUMENTS 6

/**************************************************************************
*                                                                         *
*  GLOBAL DATA FOR LOCCS SHIFT ROUTINE                                    *
*                                                                         *
*     comm_sections [i]  for i = 0, no_sections-1                         *
*                                                                         *
**************************************************************************/

/* Pipelining and communication state of one section with communication
   (one entry of comm_sections).  Dimension numbers stored in the *_dim
   arrays are 1-based array dimensions.                                  */

typedef struct

  { section_info section_id;   /* section descriptor (copied in set_section) */

    int no_ddims;  /* number of distributed dimensions */
    int no_sdims;  /* number of serial      dimensions */
    int no_pdims;  /* number of pipelined   dimensions */

    /* first no_ddims / no_sdims / no_pdims entries are valid */

    int distributed_dim [MAX_RANK];
    int serial_dim      [MAX_RANK];
    int pipelining_dim  [MAX_RANK];   /* chosen in choose_pipelining */

    /* indexed by pipelined dimension 0 .. no_pdims-1; captured from the
       local range of the section in choose_pipelining                   */

    int job_low [MAX_RANK];
    int job_high[MAX_RANK];   /* lower and upper bound of pipelined dim */

    /* overlap[dim-1] : overlap value of array dimension dim, copied from
       the overlap vector in set_section; 0 means no communication       */

    int overlap [MAX_DIMENSIONS];

    /* indexed by distributed dimension 0 .. no_ddims-1; filled in set_job
       by dalib_section_overlap, descriptors are NO_DDT if overlap is 0   */

    int p_send          [MAX_RANK];
    int p_recv          [MAX_RANK];
    dd_type ddt_send    [MAX_RANK];
    dd_type ddt_recv    [MAX_RANK];

  } pipeline_section;

/* state of all sections with communication, entries 0 .. no_sections-1 */

pipeline_section comm_sections [MAX_SECTIONS];

int no_sections;    /* number of sections with communication       */

int no_arguments;   /* total number of arguments for block routine */

/* argument_data[k] / argument_ids[k], k = 0 .. no_arguments-1 : data
   pointer and descriptor pointer of argument k; they are filled by
   set_section / set_argument and handed to the local routine in do_job */

section_info *argument_ids  [MAX_ARGUMENTS];
void         *argument_data [MAX_ARGUMENTS];

/* block size for the pipelined dimensions, set by the driver from its
   "block" argument and read in dalib_loccs_shift_range                 */

int size_block;

/**************************************************************************
*                                                                         *
*  dalib_loccs_shift_range (int *job_size, int *block_size)               *
*                                                                         *
*     - computes the size of the job and an (initial) block size          *
*                                                                         *
**************************************************************************/

static void dalib_loccs_shift_range (no_pdims, job_size, block_size)

int *no_pdims;
int block_size[], job_size[];

{  int i, job_dim;
   int job_low, job_high;

   *no_pdims = comm_sections[0].no_pdims;

   for (i=0; i < (*no_pdims); i++)

      { block_size[i] = size_block;

        job_low  = comm_sections[0].job_low[i];
        job_high = comm_sections[0].job_high[i];

#ifdef DEBUG
        printf ("%d: my pipeline range : %d - %d\n", pcb.i, job_low, job_high);
#endif
        job_size  [i] = job_high - job_low + 1;

      } /* for all dimensions used for pipelining */

}  /* dalib_loccs_shift_range */

/**************************************************************************
*                                                                         *
*   dalib_loccs_set_job (no_pdims, job_low, job_high)                     *
*                                                                         *
*     - job_low:job_high as part of 0:job_size-1, per pipelined dim       *
*     - sets local ranges of all sections to this block                   *
*     - sets send/recv partners and descriptors in comm_sections          *
*                                                                         *
**************************************************************************/

static void dalib_loccs_set_job (no_pdims, job_low, job_high)

int no_pdims;     /* number of dimensions used for pipelining */
int job_low[];
int job_high[];

{ int isec;

#ifdef DEBUG
  printf ("%d: set pdim (rel) %d to %d - %d for %d section\n", 
           pcb.i, 0, job_low[0], job_high[0], no_sections);
#endif

  for (isec = 0; isec < no_sections; isec++)

    { pipeline_section *sec;
      int k, n_distr;

      sec = comm_sections + isec;

      /* step 1 : restrict every pipelined dimension to the current block;
                  the relative job range is shifted by the lower bound
                  that was saved for this section                         */

      for (k = 0; k < no_pdims; k++)

        { int abs_dim;        /* absolute (1-based) dimension of section */
          int first, last;

          first   = sec->job_low[k] + job_low[k];
          last    = sec->job_low[k] + job_high[k];
          abs_dim = sec->pipelining_dim[k];

#ifdef DEBUG
      printf ("%d: set dim (abs) %d, section %d has range : %d - %d\n",
      pcb.i, abs_dim, isec, first, last);
#endif

          sec->section_id->dimensions[abs_dim-1].local_range[0] = first;
          sec->section_id->dimensions[abs_dim-1].local_range[1] = last;
        }

      /* step 2 : build the communication for every distributed dimension */

      n_distr = sec->no_ddims;

      for (k = 0; k < n_distr; k++)

        { int dist_dim, shift;

          dist_dim = sec->distributed_dim[k];
          shift    = sec->overlap[dist_dim-1];

          if (shift != 0)

            { dalib_section_overlap 

                 (sec->section_id, dist_dim, shift,
                  &(sec->p_send[k]), &(sec->ddt_send[k]),
                  &(sec->p_recv[k]), &(sec->ddt_recv[k]));

#ifdef DEBUG
      printf ("%d: comm %d of %d, overlap = %d for dim %d, %d <- me -> %d\n", 
              pcb.i, k, n_distr, shift, dist_dim,
              sec->p_recv[k], sec->p_send[k]);
      dalib_print_section_info (sec->section_id);
      dalib_ddt_print (sec->ddt_recv[k]);
      dalib_ddt_print (sec->ddt_send[k]);
#endif

                 /* note : ddt_recv is empty for the first processor,
                           ddt_send is empty for the last  processor   */

            }

           else

            { /* nothing to exchange along this dimension */

              sec->ddt_send[k] = NO_DDT;
              sec->ddt_recv[k] = NO_DDT;
#ifdef DEBUG
              printf ("%d: set comm %d of %d, no overlap\n",
                      pcb.i, k, n_distr);
#endif
            }

        } /* for all distributed dimensions */

    }  /* for all sections */

} /* loccs_set_job */

/**************************************************************************
*                                                                         *
*  dalib_loccs_receive ()                                                 *
*                                                                         *
*    - receives the boundary data described by ddt_recv from p_recv       *
*    - frees every receive descriptor after use                           *
*                                                                         *
**************************************************************************/

static void dalib_loccs_receive ()

{ int isec;

#ifdef DEBUG
  printf ("%d: dalib_loccs_receive for %d sections\n", pcb.i, no_sections);
#endif

  for (isec = 0; isec < no_sections; isec++)

    { pipeline_section *sec;
      int ddim, n_distr;

      sec     = &comm_sections[isec];
      n_distr = sec->no_ddims;

      for (ddim = 0; ddim < n_distr; ddim++)

       { dd_type incoming;
         int     source;

         incoming = sec->ddt_recv[ddim];
         source   = sec->p_recv[ddim];

         /* NO_DDT : nothing has to be received for this dimension */

         if (incoming == NO_DDT) continue;

#ifdef DEBUG
         printf ("%d: receive (sec=%d,dim=%d) from %d\n",
                 pcb.i, isec, ddim, source);
#endif 
         dalib_recv_ddt_op (source, incoming, 0);
#ifdef DEBUG
         printf ("%d: has been received\n", pcb.i);
#endif
         dalib_ddt_free (incoming);

       } /* for all distributed dims */

    }  /* for all sections */

} /* dalib_loccs_receive  */

/**************************************************************************
*                                                                         *
*  dalib_loccs_send ()                                                    *
*                                                                         *
*    - sends the boundary data to the next processor(s)                   *
*    - frees every send descriptor after use                              *
*                                                                         *
**************************************************************************/
 
static void dalib_loccs_send ()
 
{ int isec;

#ifdef DEBUG
  printf ("%d: dalib_loccs_send for %d sections\n", pcb.i, no_sections);
#endif

  for (isec = 0; isec < no_sections; isec++)

    { pipeline_section *sec;
      int ddim, n_distr;

      sec     = &comm_sections[isec];
      n_distr = sec->no_ddims;

      for (ddim = 0; ddim < n_distr; ddim++)

       { dd_type outgoing;
         int     target;

         outgoing = sec->ddt_send[ddim];
         target   = sec->p_send[ddim];

         /* NO_DDT : nothing has to be sent for this dimension */

         if (outgoing == NO_DDT) continue;

#ifdef DEBUG
         printf ("%d: send (sec=%d,dim=%d) to %d\n",
                 pcb.i, isec, ddim, target);
#endif 
         dalib_send_ddt (target, outgoing);
#ifdef DEBUG
         printf ("%d: send done\n", pcb.i);
#endif
         dalib_ddt_free (outgoing);

       } /* for all distributed dims */

    }  /* for all sections */

} /* dalib_loccs_send  */

/**************************************************************************
*                                                                         *
*  dalib_loccs_do_job (local_job)                                         *
*                                                                         *
*    - calls local_job with all data pointers followed by all ids         *
*    - handles 0 .. 6 arguments, other counts are ignored                 *
*                                                                         *
**************************************************************************/

static void dalib_loccs_do_job (local_job)

void local_job ();

{ 

#ifdef DEBUG
    printf ("%d: executing job (%d args)\n", pcb.i, no_arguments);
#endif 

  /* the argument list has variable length, so there is one explicit
     call for every supported number of arguments                     */

  switch (no_arguments) {

  case 0  :
     local_job ();
     break;

  case 1  :
     local_job (argument_data[0],
                argument_ids[0]);
     break;

  case 2  :
     local_job (argument_data[0], argument_data[1],
                argument_ids[0],  argument_ids[1]);
     break;

  case 3  :
     local_job (argument_data[0], argument_data[1], argument_data[2],
                argument_ids[0],  argument_ids[1],  argument_ids[2]);
     break;

  case 4  :
     local_job (argument_data[0], argument_data[1],
                argument_data[2], argument_data[3],
                argument_ids[0],  argument_ids[1],
                argument_ids[2],  argument_ids[3]);
     break;

  case 5  :
     local_job (argument_data[0], argument_data[1], argument_data[2],
                argument_data[3], argument_data[4],
                argument_ids[0],  argument_ids[1],  argument_ids[2],
                argument_ids[3],  argument_ids[4]);
     break;

  case 6  :
     local_job (argument_data[0], argument_data[1], argument_data[2],
                argument_data[3], argument_data[4], argument_data[5],
                argument_ids[0],  argument_ids[1],  argument_ids[2],
                argument_ids[3],  argument_ids[4],  argument_ids[5]);
     break;

  default :
     break;

  } /* end switch */

}  /* dalib_loccs_do_job */

/**************************************************************************
*                                                                         *
*   dalib_loccs_anti_update (pipeline_section *sec)                       *
*                                                                         *
*     - exchanges overlap data with negated overlap (anti dependences)    *
*     - receive comes before send in every distributed dimension          *
*                                                                         *
**************************************************************************/

static void dalib_loccs_anti_update (sec)

pipeline_section *sec;

{ int ddim, n_distr;

  n_distr = sec->no_ddims;

  for (ddim = 0; ddim < n_distr; ddim++)

    { dd_type outgoing, incoming;
      int     target,   source;
      int     dist_dim, shift;

      dist_dim = sec->distributed_dim[ddim];

      /* anti dependences : communicate against the overlap direction */

      shift = -(sec->overlap[dist_dim-1]);

      if (shift == 0) continue;    /* no overlap in this dimension */

      dalib_section_overlap

        (sec->section_id, dist_dim, shift,
         &target, &outgoing, &source, &incoming);

        /* note : ddt_recv is empty for the last  processor,
                  ddt_send is empty for the first processor   */

      if (incoming != NO_DDT)
        { dalib_recv_ddt_op (source, incoming, 0);
          dalib_ddt_free (incoming);
        }

      if (outgoing != NO_DDT)
        { dalib_send_ddt (target, outgoing);
          dalib_ddt_free (outgoing);
        }

    }  /* for all distributed dimensions */

} /* loccs_anti_update */

/**************************************************************************
*                                                                         *
*  dalib_loccs_do_shift (local_subroutine)                                *
*                                                                         *
*    - routine for shifting of data along distributed dimension           *
*                                                                         *
*              ---   parallel dimension of topology_section  ----         *
*                                                                         *
*               P1           P2           P3           P4                 *
*                                                                         *
*            |------|     |------|     |------|     |------|              *
*    |       | t=1  | 1-> | t=2  |     | t=3  |     | t=4  |              *
*    |       |      | 1-> |      |     |      |     |      |              *
*            |------|     |------|     |------|     |------|              *
*   ser      | t=2  | 2-> | t=3  |     | t=4  |     | t=5  |              *
*   ial      |      | 2-> |      |     |      |     |      |              *
*            |------|     |------|     |------|     |------|              *
*  dimen     | t=3  | 3-> | t=4  |     | t=5  |     | t=6  |              *
*   sion     |      | 3-> |      |     |      |     |      |              *
*            |------|     |------|     |------|     |------|              *
*    |       | t=4  | 4-> | t=5  |     | t=6  |     | t=7  |              *
*    |       |      | 4-> |      |     |      |     |      |              *
*            |------|     |------|     |------|     |------|              *
*                                                                         *
*    - work along serial dimension is split into blocks                   *
*    - after finishing one block data is sent to next processor           *
*    - pipelined execution of double nested loop                          *
*                                                                         *
**************************************************************************/

/**************************************************************************
*                                                                         *
*  dalib_next_job (dim, job_low, job_high, job_size, block_size)          *
*                                                                         *
*    - resets dimension dim-1 (if any) to its first block                 *
*    - advances dimension dim to the block after job_high[dim]            *
*    - returns 1 if dimension dim has no block left, else 0               *
*                                                                         *
**************************************************************************/

static int dalib_next_job (dim, job_low, job_high, job_size, block_size)

int dim;
int job_low[], job_high[];
int job_size[], block_size[];

{ int start, end;

  if (dim > 0)

    { /* rewind the previous dimension : block 0 .. block_size-1,
         clipped to the job size                                   */

      int prev, prev_end;

      prev     = dim - 1;
      prev_end = block_size[prev] - 1;

      if (prev_end >= job_size[prev]) prev_end = job_size[prev] - 1;

      job_low[prev]  = 0;
      job_high[prev] = prev_end;
    }

  /* next block starts directly behind the current one */

  start = job_high[dim] + 1;
  end   = start + block_size[dim] - 1;

  if (end >= job_size[dim]) end = job_size[dim] - 1;   /* clip last block */

  job_low[dim]  = start;
  job_high[dim] = end;

  /* a start beyond the job range means this dimension is exhausted */

  return (start >= job_size[dim]);

} /* dalib_next_job */

/**************************************************************************
*                                                                         *
*  dalib_loccs_do_shift (local_fortran_subroutine)                        *
*                                                                         *
*    - per block: set job, receive, run local routine, send               *
*    - without pipelined dimension the routine is called once             *
*                                                                         *
**************************************************************************/

static void dalib_loccs_do_shift (local_fortran_subroutine)

void local_fortran_subroutine ();

{  int pdim, no_pdims;
   int job_size[MAX_RANK];
   int block_size[MAX_RANK];
   int job_low[MAX_RANK], job_high[MAX_RANK];
   int done;

#ifdef DEBUG
   printf ("%d: call of loccs_shift (synchron communication)\n", pcb.i);
#endif

   /* get the ranges; the job range is 0 .. job_size - 1 */

   dalib_loccs_shift_range (&no_pdims, job_size, block_size);

   if (no_pdims == 0)

      { /* nothing to pipeline : one call, no set_job/receive/send */

        dalib_loccs_do_job (local_fortran_subroutine);
        return;
      }

   /* first block of every dimension; an empty dimension means that
      there is no job at all                                         */

   done = 0;

   for (pdim = 0; pdim < no_pdims; pdim++)

      { int last;

        last = block_size[pdim] - 1;
        if (last >= job_size[pdim]) last = job_size[pdim] - 1;

        job_low[pdim]  = 0;
        job_high[pdim] = last;

        if (job_size[pdim] <= 0) done = 1;
      }

   while (!done)

     { /* local section and communications for the current block */

       dalib_loccs_set_job (no_pdims, job_low, job_high);

       dalib_loccs_receive ();

       dalib_loccs_do_job (local_fortran_subroutine);

       dalib_loccs_send ();

       /* advance like an odometer : try dimension 0 first, carry into
          the next dimension as long as the current one is exhausted    */

       done = 1;

       for (pdim = 0; done && pdim < no_pdims; pdim++)
          done = dalib_next_job (pdim, job_low, job_high,
                                 job_size, block_size);

       /* done is still set if no dimension had a further block */

     } /* loop over blocks */

} /* FUNCTION(dalib_loccs_do_shift) */

/**************************************************************************
*                                                                         *
*   dalib_loccs_set_section (section_data, section_id,                    *
*                            overlap_data, overlap_id)                    *
*                                                                         *
*     - registers the section as next argument of the local routine       *
*     - classifies its dimensions as serial or distributed                *
*     - copies the overlap vector (must be serial, contiguous)            *
*     - overlap_id is not used                                            *
*                                                                         *
**************************************************************************/

static void dalib_loccs_set_section (section_data, section_id,
                                     overlap_data, overlap_id)

void *section_data;
int  *overlap_data;

section_info *section_id, *overlap_id;

{ int slot;
  int section_rank, array_rank;
  array_info array_id;
  int n_serial, n_distr;
  int dim;

  /* the current argument number is used as index for the argument
     tables and for comm_sections as well                            */

  slot = no_arguments;

  argument_ids  [slot] = section_id;
  argument_data [slot] = section_data;

#ifdef DEBUG
	printf("%d : IN DALIB LOCCS_SET_SECTION \n",pcb.i);
#endif

  section_rank = dalib_section_rank (*section_id);
  array_id     = (*section_id)->array_id;
  array_rank   = array_id->rank;

#ifdef DEBUG
        printf("%d : IN DALIB LOCCS_SET_SECTION \n",pcb.i);
#endif

  if (section_rank != array_rank)
     { dalib_internal_error ("loccs shift: section must be over full rank");
       dalib_stop ();
     }

  /* sort the array dimensions (1-based) into serial / distributed ones */

  n_serial = 0;
  n_distr  = 0;

  for (dim = 1; dim <= array_rank; dim++)

     { int top_id, top_dim;

       dalib_array_map_query (array_id, dim, &top_id, &top_dim);

       if (top_id != 0)    /* dim is a distributed dimension */

          comm_sections[slot].distributed_dim[n_distr++] = dim;

        else               /* dim is a serial dimension */

          comm_sections[slot].serial_dim[n_serial++] = dim;
     }

  comm_sections[slot].no_sdims   = n_serial;
  comm_sections[slot].no_ddims   = n_distr;
  comm_sections[slot].section_id = *section_id;

  /* Attention: overlap_data must be serial, contiguous */

  for (dim = 0; dim < array_rank; dim++)
     comm_sections[slot].overlap[dim] = overlap_data[dim];

#ifdef DEBUG
  printf ("section argument %d : # serial dims = %d, # distr dims = %d\n",
           slot+1, n_serial, n_distr);
#endif

  no_arguments = slot + 1;

} /* dalib_loccs_set_section */

/**************************************************************************
*                                                                         *
*   dalib_loccs_set_argument (data, id)                                   *
*                                                                         *
*     - appends a non-communication argument (data pointer + id)          *
*                                                                         *
**************************************************************************/

static void dalib_loccs_set_argument (data, id)

void *data;
section_info *id;

{ int slot;

  slot = no_arguments;       /* next free entry of the argument tables */

  argument_data [slot] = data;
  argument_ids  [slot] = id;

  no_arguments = slot + 1;

} /* dalib_loccs_set_argument */

/**************************************************************************
*                                                                         *
*   dalib_loccs_choose_pipelining ()                                      *
*                                                                         *
*     - this routine fixes the dimensions that are used for               *
*       building subblocks within the pipeline                            *
*     - with serial dimensions : the last serial dimension                *
*     - otherwise              : all distributed dims, reversed           *
*     - job_low/job_high are taken from the current local ranges          *
*                                                                         *
**************************************************************************/
 
void dalib_loccs_choose_pipelining ()

{ int isec;

  for (isec = 0; isec < no_sections; isec++)

     {  /* define it for every section */

        int n_serial, n_distr, n_pipe;
        int k;

        n_serial = comm_sections[isec].no_sdims;
        n_distr  = comm_sections[isec].no_ddims;

        if (n_serial != 0)

           { /* pipeline over the last serial dimension only */

             n_pipe = 1;
             comm_sections[isec].pipelining_dim [0] = 
                comm_sections[isec].serial_dim[n_serial-1];
           }

          else 

           { /* no serial dimension : pipeline over all distributed
                dimensions in reverse order (nothing if there is none) */

             n_pipe = n_distr;

             for (k = 0; k < n_distr; k++)
               comm_sections[isec].pipelining_dim [k] = 
                 comm_sections[isec].distributed_dim[n_distr-1-k];
           }

        comm_sections[isec].no_pdims = n_pipe;

        /* save the full local range of every pipelined dimension, as
           set_job overwrites the range in the section descriptor       */

        for (k = 0; k < n_pipe; k++)

          { int pdim;     /* absolute (1-based) dimension */

            pdim = comm_sections[isec].pipelining_dim[k];

            comm_sections [isec].job_low[k]  = 
              comm_sections[isec].section_id->dimensions[pdim-1].local_range[0];
            comm_sections [isec].job_high[k] = 
              comm_sections[isec].section_id->dimensions[pdim-1].local_range[1];
          }

     }  /* for every section */

} /* dalib_loccs_choose_pipelining */

/**************************************************************************
*                                                                         *
* FUNCTION(dalib_loccs_driver) (local_routine,block, n, m,                *
*                               a1, d1, ...., a<n>, d<n>,                 *
*                               b1, b2, ..., b<m>)                        *
*                                                                         *
*    - attentions: there will be data and descriptor arguments            *
*                                                                         *
**************************************************************************/

# define N_LOCCS 16

void FUNCTION(dalib_loccs_driver) (local_routine, block, n, m,
                                   p1, p2, p3, p4, p5, p6, p7, p8,
                                   p9, p10, p11, p12, p13, p14, p15, p16)

/* Attention : kind of arguments depends on n and m */

void local_routine ();
int  *block;
int  *n, *m;
void *p1,  *p2,  *p3,  *p4,  *p5,  *p6,  *p7, *p8;
void *p9, *p10, *p11, *p12, *p13, *p14, *p15, *p16;

{ void *arguments [N_LOCCS];

  int i, n_args;

  size_block = *block;


  n_args = 2 * (*n) + (*m);   /* number of FORTRAN arguments */
  n_args = 2 * n_args + 4;    /* includes now descriptors    */

  if (n_args > N_LOCCS)

     { dalib_internal_error ("dalib_loccs_shift: too many arguments");
       dalib_stop ();
     }

  if (*n + *m > MAX_ARGUMENTS)

     { dalib_internal_error ("dalib_loccs_shift: too many arguments");
       dalib_stop ();
     }

  if (*n < 1)

     { dalib_internal_error ("dalib_loccs_shift: no pipelining section");
       dalib_stop ();
     }

  switch (n_args) {

     case 16  : arguments[15] = p16;
     case 15  : arguments[14] = p15;
     case 14  : arguments[13] = p14;
     case 13  : arguments[12] = p13;
     case 12  : arguments[11] = p12;
     case 11  : arguments[10] = p11;
     case 10  : arguments [9] = p10;
     case  9  : arguments [8] =  p9;
     case  8  : arguments [7] =  p8;
     case  7  : arguments [6] =  p7;
     case  6  : arguments [5] =  p6;
     case  5  : arguments [4] =  p5;
     case  4  : arguments [3] =  p4;
     case  3  : arguments [2] =  p3;
     case  2  : arguments [1] =  p2;
     case  1  : arguments [0] =  p1;

  } /* switch */

  /* now sort the arguments out */

  /* no_sections, no_arguments are globally set */

  no_sections = *n;
  n_args      = 2 * no_sections + (*m) + 4;

  no_arguments = 0;

  for (i=0; i < no_sections; i++)

     { /* arguments [2*i]       : pointer to the section
          arguments [2*i+1]     : pointer to dependence vector
          arguments [arg+2*i]   : descriptor of the section
          arguments [arg+2*i+1] : descriptor of the dependence vector */

       dalib_loccs_set_section 

         (arguments[2*i],   (section_info *) arguments[n_args+2*i],
         (int *) arguments[2*i+1], (section_info *) arguments[n_args+2*i+1]);

     }  /* for every section */

  /* set the other m arguments */

  for (i=2*no_sections; i<2*no_sections+(*m); i++)

     dalib_loccs_set_argument (arguments[i], 
                               (section_info *) arguments[n_args+i]);

  /* set the topology information globally */

  dalib_loccs_choose_pipelining ();

  dalib_loccs_do_shift (local_routine);

} /* dalib_loccs_driver */

/**************************************************************************
*                                                                         *
*    dalib_loccs_anti (section_data, overlap_data,                        *
*                      section_dsp, overlap_dsp)                          *
*                                                                         *
*     - updates overlap area of section according to anti dependences     *
*                                                                         *
**************************************************************************/
 
void FUNCTION(dalib_loccs_anti) (section_data, overlap_data,
                                 section_dsp,  overlap_dsp)

void *section_data;
int  *overlap_data;

section_info *section_dsp, *overlap_dsp;

{ /* exactly one section, it becomes argument 0 / comm_sections[0] */

  no_arguments = 0;
  no_sections  = 1;

  /* set_section does the classification of the dimensions and copies
     the overlap vector, so it is reused here                          */

  dalib_loccs_set_section (section_data, section_dsp,
                           overlap_data, overlap_dsp);

  dalib_loccs_anti_update (&comm_sections[0]);

} /* dalib_loccs_anti */

