/**************************************************************************
*                                                                         *
*  Author      : Dr. Thomas Brandes, GMD, SCAI.LAB                        *
*  Copyright   : GMD St. Augustin, Germany                                *
*  Date        : Oct 93                                                   *
*  Last Update : Feb 96                                                   *
*                                                                         *
*  This Module is part of the DALIB                                       *
*                                                                         *
*  Module      : structured.m4                                            *
*                                                                         *
*  Function    : Operation for structured movement                        *
*                                                                         *
*  void dalib_smove (schedule_id,                                         *
*          t_NP, t_id, t_kind, t_size, t_1, t_N, t_a, t_b,                *
*                                      t_low, t_up, t_str,                *
*          s_NP, s_id, s_kind, s_size, s_1, s_N, s_a, s_b,                *
*                                      s_low, s_up, s_str,                *
*                                                                         *
*     T[...,t_low:t_up:t_str]  =  S[...,s_low:s_up:s_str]                 *
*              of t_1:t_N                of s_1:s_N                       *
*                                                                         *
*  x_low:x_up:x_str   :  describes the section                            *
*  x_1:x_N            :  extension of the template dimension              *
*  x_NP               :  number of processors allocated to the temp dim   *
*  x_id               :  own processor position 1 <= x_id <= x_NP         *
*  x_kind             :  describes how the dimension is distributed       *
*  x_a, x_b           :  describes the alignment, x_a * I + x_b           *
*                                                                         *
*  CHANGES:                                                               *
*                                                                         *
*    08/97  : free of schedules                                           * 
*                                                                         *
**************************************************************************/

# undef DEBUG
# define CHECK

# include "dalib.h"

/*******************************************************************
*                                                                  *
*  void dalib_shift_section (int overlap_section [2],              *
*                            int t_1, int t_n, int ov_update)      *
*                                                                  *
*   - tests for circular shifting                                  *
*   - [1:N], 1  -> 0:N-1 are updated values                        *
*                                                                  *
*******************************************************************/

static void dalib_shift_section (overlap_section, t_1, t_N, ov_update)

  /* Adjusts the overlap providing section for circular shifting.

     overlap_section : in/out, [0] is the lower and [1] the upper bound
     t_1, t_N        : lower and upper bound of the template dimension
     ov_update       : > 0 left overlap update, < 0 right overlap update,
                       0 leaves the section untouched

     ov_update > 0 : if the section starts below t_1, its upper bound is
                     increased by the number of elements below t_1
     ov_update < 0 : if the section ends above t_N, its lower bound is
                     decreased by the number of elements above t_N      */

int overlap_section[];
int t_1, t_N;
int ov_update;

{ if (ov_update == 0) return;     /* no overlap update, nothing to shift */

  if (ov_update > 0)

     { /* left overlap: section may start in front of the template */

       if (t_1 > overlap_section[0])
          overlap_section[1] = overlap_section[1]
                               + (t_1 - overlap_section[0]);

       return;
     }

  /* right overlap: section may end behind the template */

  if (overlap_section[1] > t_N)
     overlap_section[0] = overlap_section[0]
                          - (overlap_section[1] - t_N);

} /* dalib_shift_section */

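/*******************************************************************
*                                                                  *
*  dalib_set_comm_parts                                            *
*                                                                  *
*   - computes the index ranges at the lower and upper border of   *
*     my local section that take part in an overlap update         *
*                                                                  *
*******************************************************************/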

static void dalib_set_comm_parts (my_section, ov_update,
                                  left_side, right_side)

  /* Computes the two border ranges of the local section that take part
     in an overlap update of width |ov_update|.

     my_section : in,  my_section[0]:my_section[1] is the local range
     ov_update  : in,  > 0 : update of the left overlap
                       otherwise : update of the right overlap
     left_side  : out, range left_side[0]:left_side[1] at the lower border
     right_side : out, range right_side[0]:right_side[1] at the upper border

     ov_update > 0 : right_side are the last |ov_update| local elements,
                     left_side the |ov_update| elements directly below
                     the local range
     otherwise     : left_side are the first |ov_update| local elements,
                     right_side the |ov_update| elements directly above
                     the local range

     If the range inside the local section does not fit into it,
     dalib_internal_error and dalib_stop are called.                      */

int my_section[];
int ov_update;         /* was undeclared (implicit int) in the K&R list */
int left_side [];
int right_side [];

{ int my_low, my_up;
  int val;               /* absolute value of ov_update */

  /*  ov_update > 0 :  update left overlap, implies sending to the right
      ov_update < 0 :  update right overlap, implies sending to the left   */

  val = ov_update;
  if (ov_update < 0) val = -ov_update;

  my_low = my_section[0];
  my_up  = my_section[1];
   
  if (ov_update > 0)

    { /* left overlap update */

     /***********************************************************
     *                                                          *
     *         lb  ....  ub-val+1 ... ub                        *
     *         -   -   -   x   x   x   x                        *
     *         -   -   -   x   x   x   x                        *
     *                                                          *
     ***********************************************************/

      right_side [0] = my_up - val + 1;
      right_side [1] = my_up;

      if (right_side[0] < my_low)

         { /* I have not all values for my right neighbor */

           dalib_internal_error ("overlap too big");
           dalib_stop ();
         }


      /***********************************************************
      *                                                          *
      *         lb-val ... lb-1 lb  ... ub                       *
      *            x  x ..   x   -  ... -                        *
      *                                                          *
      ***********************************************************/
 
      /* range in front of my local part (left overlap area) */

      left_side [0] = my_low - val;
      left_side [1] = my_low - 1;

    }

  else

   { /***********************************************************
     *                                                          *
     *  - send left part of my local part to the left proc.     *
     *                                                          *
     *         lb  .... lb+val-1  ... ub                        *
     *         x   x   x   x   -  -    -                        *
     *         x   x   x   x   -  -    -                        *
     *                                                          *
     ***********************************************************/
 
      left_side [0] = my_low;
      left_side [1] = my_low + val - 1;

      if (left_side[1] > my_up)

         { /* I have not all values for my left neighbor */

           dalib_internal_error ("overlap too big");
           dalib_stop ();
         }

      /***********************************************************
      *                                                          *
      *          lb  ....    ub  ub+1  ...  ub+val               *
      *           -  -   -   -     x    x     x                  *
      *           -  -   -   -     x    x     x                  *
      *                                                          *
      ***********************************************************/
 
      /* range behind my local part (right overlap area) */
 
      right_side [0] = my_up + 1;
      right_side [1] = my_up + val;

   }
  
} /* dalib_set_comm_parts */
 
/*******************************************************************
*                                                                  *
*  dalib_ov_move                                                   *
*                                                                  *
*   - update of overlap area                                       *
*                                                                  *
*         P =  1     NId       NP          block distr on NP procs * 
*                                                                  *
*     t_1 .......................  t_N     template extent         *
*                                                                  *
*******************************************************************/
 
void dalib_ov_move (schedule_id,

                    NP, NId, t_mapping, t_1, t_N, t_base, t_stride, 
                    array_section, ov_pos)

  /* Builds the schedule for the update of an overlap area.

     schedule_id   : out, identification of the new schedule
     NP, NId       : number of processors, own position (1 <= NId <= NP
                     is what the neighbor computation below works with)
     t_mapping     : distribution of the template dimension
     t_1, t_N      : extent of the template dimension
     t_base,
     t_stride      : alignment, array index I is mapped to
                     t_stride * I + t_base
     array_section : section for which the overlap is calculated
     ov_pos        : overlap position in array coordinates, the sign
                     selects the side (see below)

     One send to one neighbor and one receive from the other neighbor
     (both taken circular in 1:NP) are entered into the schedule.       */

int *schedule_id;
int NP, NId;

int t_1, t_N, t_base, t_stride;

int array_section[3];
int ov_pos;

DistDim t_mapping;

{ int tmpl_sec[3];       /* image of array_section in the template     */
  int provider_sec[3];   /* template section providing the overlap     */

  int extent[2];         /* t_1 : t_N                                  */
  int local_sec[3];      /* my local part of the template dimension    */

  int lo_border[2];      /* range at the lower end of my local part    */
  int hi_border[2];      /* range at the upper end of my local part    */

  int out_sec[3];        /* what I send                                */
  int in_sec[3];         /* what I receive                             */

  int *out_border, *in_border;
  int out_pid, in_pid;

  int shift;             /* overlap width in template coordinates      */
  int width;             /* absolute value of shift                    */
  int pred, succ;        /* left and right neighbor in 1:NP            */

#ifdef DEBUG
  printf ("%d: ov_move, update = %d, sec = %d:%d, -> %d*I+%d in %d:%d\n",
           pcb.i, ov_pos, array_section[0], array_section[1], 
           t_stride, t_base, t_1, t_N);
#endif

  shift = ov_pos * t_stride;
  width = (shift < 0) ? -shift : shift;

  /* the overlap must not be wider than the share of one processor */

  if ((t_N - t_1 + 1) / NP < width)

    { dalib_internal_error ("overlap area too small");
      printf ("dim %d:%d distributed on %d processors, overlap = %d\n",
               t_1, t_N, NP, width);
      dalib_stop();
    }

  /* image of the array section in the template */

  tmpl_sec[0] = array_section[0] * t_stride + t_base;
  tmpl_sec[1] = array_section[1] * t_stride + t_base;
  tmpl_sec[2] = array_section[2] * t_stride;

  /* template section that provides the overlap data:
     low - shift : high - shift                          */

  provider_sec[0] = tmpl_sec[0] - shift;
  provider_sec[1] = tmpl_sec[1] - shift;
  provider_sec[2] = t_stride;

  dalib_shift_section (provider_sec, t_1, t_N, shift);

  /* my local range of the template dimension */

  extent[0] = t_1;
  extent[1] = t_N;

  dalib_distribution_size (t_mapping, NId, NP, extent, local_sec);

  *schedule_id = dalib_new_schedule (NP, NP);

  /* circular neighbors in 1:NP coordinates */

  pred = (NId - 1 < 1)  ? NP : NId - 1;
  succ = (NId + 1 > NP) ? 1  : NId + 1;

  dalib_set_comm_parts (local_sec, shift, lo_border, hi_border);

  /* shift > 0 : left overlap update, send upper border to the right
                 neighbor, receive lower border from the left neighbor
     otherwise : send lower border to the left neighbor, receive
                 upper border from the right neighbor                  */

  if (shift > 0)
    { out_border = hi_border;  out_pid = succ;
      in_border  = lo_border;  in_pid  = pred;
    }
   else
    { out_border = lo_border;  out_pid = pred;
      in_border  = hi_border;  in_pid  = succ;
    }

  /* send part, mapped back into array coordinates */

  dalib_intersect_range_slice (provider_sec, out_border, out_sec);
  dalib_map1_section (tmpl_sec, array_section, out_sec, out_sec);
  dalib_schedule_send_section (*schedule_id, out_pid, out_sec);

  /* recv part, mapped back into array coordinates */

  dalib_intersect_range_slice (provider_sec, in_border, in_sec);
  dalib_map1_section (tmpl_sec, array_section, in_sec, in_sec);
  dalib_schedule_recv_section (*schedule_id, in_pid, in_sec);

} /* dalib_ov_move */

/*******************************************************************
*                                                                  *
*   void output_section (id, msg, section)                         *
*                                                                  *
*******************************************************************/

void output_section (id, msg, section)

  /* Prints one line "id: smove msg : low:up:stride" to stdout.

     id      : number printed in front of the line
     msg     : text describing the section
     section : section[0] = low, section[1] = up, section[2] = stride */

int id;
char msg[];
int section [];

{ int low, up, stride;

  low    = section[0];
  up     = section[1];
  stride = section[2];

  printf ("%d: smove %s : %d:%d:%d\n", id, msg, low, up, stride);

} /* output_section */

/*******************************************************************
*                                                                  *
*  dalib_struct_move                                               *
*                                                                  *
*     map:   [s_low:s_up:s_str] -> [t_low:t_up:t_str]              *
*                                                                  *
*    s_low:s_up:s_str is mapped t_a * I + t_b in t_1:t_N           *
*                                                                  *
*******************************************************************/

void dalib_struct_move (schedule_id,

                target_NP, target_NId, target_mapping,
                target_base, target_stride,
                target_global, target_section,

                source_NP, source_NId, source_mapping,
                source_base, source_stride,
                source_global, source_section)

  /* Builds a section based schedule for

        target [target_section] = source [source_section]

     schedule_id     : out, identification of the new schedule
     x_NP, x_NId     : number of processors and own position for x
     x_mapping       : distribution of the dimension of x
     x_base,x_stride : alignment of x passed to dalib_aligned_section
     x_global        : global extent passed to dalib_distribution_range
     x_section       : section low:up:stride of x

     For every target processor one send section and for every source
     processor one recv section is entered into the schedule.          */

int *schedule_id;

int target_NP, target_NId;
int source_NP, source_NId;
int target_base, target_stride;
int source_base, source_stride;
DistDim source_mapping, target_mapping;
int source_global[];
int source_section[];
int target_global[];
int target_section[];


{ int src_aligned[3];   /* aligned source section              */
  int src_mine[3];      /* part of the aligned source section  */

  int tgt_aligned[3];   /* aligned target section              */
  int tgt_mine[3];      /* part of the aligned target section  */

  int sec_out[3];
  int sec_in[3];

  int peer;

#ifdef DEBUG
  printf ("%d: struct_move, send (%d of %d), recv (%d of %d\n",
           pcb.i, source_NId, source_NP, target_NId, target_NP);
#endif

  /* aligned images of both sections */

  dalib_aligned_section (source_section, source_base, source_stride,
                         src_aligned);

  dalib_aligned_section (target_section, target_base, target_stride,
                         tgt_aligned);

  /* at most target_NP sends and source_NP receives */

  *schedule_id = dalib_new_schedule (target_NP, source_NP);

  /*************************
  *                        *
  *  SEND  part            *
  *                        *
  *************************/

#ifdef DEBUG
  output_section (pcb.i, "new source section", src_aligned);
#endif

  /* the part of the source that belongs to me */

  dalib_distribution_range (source_mapping, source_NId, source_NP,
                            source_global, src_aligned,
                            src_mine);

#ifdef DEBUG
  output_section (pcb.i, "my part of source_section", src_mine);
#endif

  /* its image in the target: the part I am responsible for */

  dalib_map1_section (src_aligned, tgt_aligned,
                      src_mine,    tgt_mine);

#ifdef DEBUG
  output_section (pcb.i, "responsible for target_section", tgt_mine);
#endif

  peer = 1;

  while (peer <= target_NP)

    { /* part of my target share that belongs to processor peer */

      dalib_distribution_range (target_mapping, peer, target_NP, 
                                target_global, tgt_mine,
                                sec_in);

#ifdef DEBUG
      output_section (pcb.i, "has to be received from me", sec_in);
#endif

      /* back into the coordinates of the original source section */

      dalib_map1_section (tgt_aligned, source_section,
                          sec_in,      sec_out);

#ifdef DEBUG
      output_section (pcb.i, "so I have to send", sec_out);
#endif

      dalib_schedule_send_section (*schedule_id, peer, sec_out);

      peer++;
    }

  /*************************
  *                        *
  *  RECV  part            *
  *                        *
  *************************/

#ifdef DEBUG
  output_section (pcb.i, "new target section", tgt_aligned);
#endif

  /* the part of the target that belongs to me */

  dalib_distribution_range (target_mapping, target_NId, target_NP,
                            target_global, tgt_aligned,
                            tgt_mine);

#ifdef DEBUG
  output_section (pcb.i, "my part of target section", tgt_mine);
#endif

  /* its image in the source: the part I will need */

  dalib_map1_section (tgt_aligned, src_aligned,
                      tgt_mine,    src_mine);

#ifdef DEBUG
  output_section (pcb.i, "needed source section", src_mine);
#endif

  peer = 1;

  while (peer <= source_NP)

    { /* part of the needed source that belongs to processor peer */

      dalib_distribution_range (source_mapping, peer, source_NP,
                                source_global, src_mine,
                                sec_out);

#ifdef DEBUG
      output_section (pcb.i, "has to be sent to me", sec_out);
#endif

      /* back into the coordinates of the original target section */

      dalib_map1_section (src_aligned, target_section,
                          sec_out,     sec_in);

#ifdef DEBUG
      output_section (pcb.i, "have I to receive", sec_in);
#endif

      dalib_schedule_recv_section (*schedule_id, peer, sec_in);

      peer++;
    }

# ifdef DEBUG 
     dalib_print_schedule (*schedule_id);
# endif

} /* dalib_struct_move */

/*******************************************************************
*                                                                  *
*  dalib_any_move                                                  *
*                                                                  *
*******************************************************************/

void dalib_any_move (schedule_id,

                target_NP, target_NId, target_mapping,
                target_base, target_stride,
                target_global, target_section,

                source_NP, source_NId, source_mapping,
                source_base, source_stride,
                source_global, source_section)

  /* Builds an element-wise schedule for

        target [target_section] = source [source_section]

     Every index of both sections is enumerated, its owner is determined
     with dalib_distribution_addresses, and the values I own in the
     source (to send) and in the target (to receive) are sorted by the
     partner processor.

     schedule_id     : out, identification of the new schedule
     x_NP, x_NId     : number of processors and own position for x
     x_mapping       : distribution of the dimension of x
     x_base,x_stride : alignment of x
     x_global        : x_global[0]:x_global[1] is the global extent
     x_section       : section low:up:stride of x

     The number of elements is taken from source_section; both sections
     are expected to have the same number of elements.

     NOTE(review): send_size/recv_size have MAXP entries, so target_NP
     and source_NP are assumed not to exceed MAXP - confirm at callers. */

int *schedule_id;

int target_NP, target_NId;
int source_NP, source_NId;
int target_base, target_stride;
int source_base, source_stride;
DistDim source_mapping, target_mapping;
int source_global[];
int source_section[];
int target_global[];
int target_section[];

{ int dim_size;
  int i, j;
#ifdef DEBUG
  int k;
#endif
  int lb, ub, str;

  int *source_vals, *target_vals;
  int *source_owner, *target_owner;

  int is_new;

  int send_total;
  int send_size[MAXP];
  int send_offset[MAXP];
  int *send_indexes;

  int recv_total;
  int recv_size[MAXP];
  int recv_offset[MAXP];
  int *recv_indexes;

#ifdef DEBUG
  printf ("%d: dalib_any_move", pcb.i);
#endif

  dim_size = dalib_range_size (source_section[0], source_section[1],
                               source_section[2]);

  source_vals  = dalib_int_malloc (dim_size, "move_schedule, source vals");
  target_vals  = dalib_int_malloc (dim_size, "move_schedule, target vals");
  source_owner = dalib_int_malloc (dim_size, "move_schedule, source owner");
  target_owner = dalib_int_malloc (dim_size, "move_schedule, target owner");

  /* enumerate the indexes of the source section; j < dim_size makes
     sure that never more than the allocated entries are written     */

  lb  = source_section[0];
  ub  = source_section[1];
  str = source_section[2];

  if (str > 0)
     for (i=lb, j=0; j<dim_size && i<=ub; i+=str, j++) source_vals[j] = i;
   else
     for (i=lb, j=0; j<dim_size && i>=ub; i+=str, j++) source_vals[j] = i;

  /* enumerate the indexes of the target section; the buffer has been
     sized by the source section, so a longer target section must not
     run over its end                                                 */

  lb  = target_section[0];
  ub  = target_section[1];
  str = target_section[2];

  if (str > 0)
     for (i=lb, j=0; j<dim_size && i<=ub; i+=str, j++) target_vals[j] = i;
   else
     for (i=lb, j=0; j<dim_size && i>=ub; i+=str, j++) target_vals[j] = i;

  for (j=0; j<dim_size; j++)

    { source_owner[j] = 1; target_owner[j] = 1;
#ifdef DEBUG
      printf ("index %d : source val = %d, target val = %d\n",
               j+1, source_vals[j], target_vals[j]);
#endif 
    }


  is_new = 1;   /* makes sure that next call will not allocate memory */

  dalib_distribution_addresses (source_NP, source_mapping, source_base,
                                source_stride, source_global[0], 
                                source_global[1], dim_size,
                                &source_vals, &is_new, source_owner);

  is_new = 1;   /* makes sure that next call will not allocate memory */

  dalib_distribution_addresses (target_NP, target_mapping, target_base,
                                target_stride, target_global[0], 
                                target_global[1], dim_size,
                                &target_vals, &is_new, target_owner);

  /* count the number of values to send and to receive;
     owners are used as positions in 1:NP, so entry owner-1 is counted */

  send_total = 0; recv_total = 0;

  for (i=0; i<target_NP; i++) send_size[i] = 0;
  for (i=0; i<source_NP; i++) recv_size[i] = 0;

  for (j=0; j<dim_size; j++)

    { if (source_owner[j] == source_NId)

         { /* I have to send this values */

           send_size  [target_owner[j]-1]++;
           send_total ++;

         }

      if (target_owner[j] == target_NId)

         { /* I have to receive this value */

           recv_size [source_owner[j]-1]++;
           recv_total ++;

         }
    }

#ifdef DEBUG
  printf ("%d: send (total=%d) ", pcb.i, send_total);
  for (j=0; j<target_NP; j++)
    printf (" %d to %d,", send_size[j], j+1);
  printf ("\n");

  /* recv_size has one entry for every source processor */

  printf ("%d: recv (total=%d) ", pcb.i, recv_total);
  for (j=0; j<source_NP; j++)
    printf (" %d from %d,", recv_size[j], j+1);
  printf ("\n");
#endif

  /* now build the offsets (exclusive prefix sums of the sizes) */

  send_offset[0] = 0;

  for (j=1; j<target_NP; j++) 
     send_offset[j] = send_offset[j-1] + send_size[j-1];

  recv_offset[0] = 0;

  for (j=1; j<source_NP; j++) 
     recv_offset[j] = recv_offset[j-1] + recv_size[j-1];

  /* now sort the indexes for sending and receiving;
     the offsets are used as running write positions and are
     not valid any longer after this loop                      */

  send_indexes = dalib_int_malloc (send_total, "move_schedule send indexes");
  recv_indexes = dalib_int_malloc (recv_total, "move_schedule recv indexes");

  for (j=0; j<dim_size; j++)

    { if (source_owner[j] == source_NId)

         { int target = target_owner[j];

           send_indexes [send_offset[target-1]++] = source_vals[j];

         }

      if (target_owner[j] == target_NId)

         { int source = source_owner[j];

           recv_indexes [recv_offset[source-1]++] = target_vals[j];

         }
    }

#ifdef DEBUG
  printf ("print my schedule : \n");

  for (j=0, k=0; j<target_NP; j++)

    { printf ("send %d values to %d : ", send_size[j], j+1);

      for (i=0; i<send_size[j]; i++, k++)
         printf ("%d ", send_indexes[k]);

      printf ("\n");
    }

  for (j=0, k=0; j<source_NP; j++)

    { printf ("recv %d values from %d : ", recv_size[j], j+1);

      for (i=0; i<recv_size[j]; i++, k++)
         printf ("%d ", recv_indexes[k]);

      printf ("\n");

    }

#endif

  /* target_NP : number of processors I have to send to 
     source_NP : number of processors I have to recv from  */

  *schedule_id = dalib_new_schedule (target_NP, source_NP);

  /* the following call will take responsibility of memory needed
     for send_indexes and recv_indexes                            */

  dalib_set_any_schedule (*schedule_id,
                          send_size, send_indexes,
                          recv_size, recv_indexes);

#ifdef DEBUG
  dalib_print_schedule (*schedule_id);
#endif

  dalib_int_free (target_owner, dim_size);
  dalib_int_free (source_owner, dim_size);
  dalib_int_free (target_vals, dim_size);
  dalib_int_free (source_vals, dim_size);

} /* dalib_any_move */

/*******************************************************************
*                                                                  *
*  dalib_get_move_schedule                                         *
*                                                                  *
*     map:   [s_low:s_up:s_str] -> [t_low:t_up:t_str]              *
*                                                                  *
*    s_low:s_up:s_str is mapped t_a * I + t_b in t_1:t_N           *
*                                                                  *
*******************************************************************/

static int dalib_move_dim_is_structured (mapping, NP, kind)

  /* Returns 1 if a dimension with this distribution can be handled by
     the structured move, 0 otherwise.

     mapping : distribution of the dimension
     NP      : number of processors of the dimension
     kind    : distribution kind as delivered by dalib_dim_mapping_info

     A dimension on a single processor is always structured. Serial,
     block and general block are structured, cyclic only for block
     size 1, indirect and arbitrary never. An unknown kind is reported
     with dalib_internal_error / dalib_stop.                           */

DistDim mapping;
int NP;
int kind;

{ int block_size;

  if (NP <= 1) return (1);   /* a single processor is always structured */

  switch (kind) {

    case kSERIAL_DIM    :
    case kBLOCK_DIM     :
    case kGEN_BLOCK_DIM : return (1);

    case kCYCLIC_DIM    : dalib_dist_cyclic_info (mapping, &block_size);
                          return (block_size == 1);

    case kINDIRECT_DIM  :
    case kARBITRARY_DIM : return (0);

    default : printf ("illegal kind in structured move :%d\n", kind);
              dalib_internal_error ("SERIOUS");
              dalib_stop ();

  } /* switch */

  return (1);   /* as before: still treated as structured if
                   dalib_stop should return                   */

} /* dalib_move_dim_is_structured */

void dalib_get_move_schedule (schedule_id,

                target_NP, target_NId, target_mapping,
                target_base, target_stride,
                target_global, target_section,

                source_NP, source_NId, source_mapping,
                source_base, source_stride,
                source_global, source_section)

  /* Builds the schedule for

        target [target_section] = source [source_section]

     and selects the way it is computed: dalib_struct_move if the source
     and the target dimension are both structured, dalib_any_move
     otherwise. All arguments are passed on unchanged.

     schedule_id     : out, identification of the new schedule
     x_NP, x_NId     : number of processors and own position for x
     x_mapping       : distribution of the dimension of x
     x_base,x_stride : alignment of x
     x_global        : global extent of x
     x_section       : section low:up:stride of x                       */

int *schedule_id;

int target_NP, target_NId;
int source_NP, source_NId;
int target_base, target_stride;
int source_base, source_stride;
DistDim source_mapping, target_mapping;
int source_global[];
int source_section[];
int target_global[];
int target_section[];

{ int source_kind, target_kind;
  int source_dim,  target_dim;     /* delivered by the info call, not used */
  int struct_source, struct_target;

  /* serial, general block, block and cyclic(1) are structured */

  dalib_dim_mapping_info (source_mapping, &source_kind, &source_dim);
  dalib_dim_mapping_info (target_mapping, &target_kind, &target_dim);

  struct_source = dalib_move_dim_is_structured (source_mapping, source_NP,
                                                source_kind);

  struct_target = dalib_move_dim_is_structured (target_mapping, target_NP,
                                                target_kind);

  if (struct_source && struct_target)

   { dalib_struct_move (schedule_id,

                target_NP, target_NId, target_mapping,
                target_base, target_stride,
                target_global, target_section,

                source_NP, source_NId, source_mapping,
                source_base, source_stride,
                source_global, source_section);
      return;
   } 

  /* at least one dimension is not structured: element-wise schedule */

  dalib_any_move (schedule_id,

                target_NP, target_NId, target_mapping,
                target_base, target_stride,
                target_global, target_section,

                source_NP, source_NId, source_mapping,
                source_base, source_stride,
                source_global, source_section);

} /* dalib_get_move_schedule */
