/*
 * example.c
 * joseph adler
 * august 1995
 *
 * Copyright (C) 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies
 * and that both that copyright notice and this permission notice
 * appear in supporting documentation. The author makes no
 * representations about the suitability of this software for any
 * purpose. It is provided "as is" without express or implied
 * warranty.
 *
 * THE AUTHORS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
 * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * RCS $Id: example.c,v 1.12 1995/08/24 08:24:34 tuna Exp $
 */

/* This example program is designed to help guide the user through
 * programming a simple program with CRL. It is a simple matrix
 * multiply program, with some extra hooks for dividing work in some
 * reasonable fashion.
 *
 * Simple serial version, adopted from:
 *
 *  _Introduction to Algorithms_. Thomas H. Cormen, Charles
 *  E. Leiserson, and Ronald L. Rivest. MIT Press, 1990.
 *
 *  void mm(int a[N][N], int b[N][N], int c[N][N])
 *  {
 *    int i, j, k;
 *
 *    for (i=0; i<N; i++)
 *    {
 *      for (j=0; j<N; j++)
 *      {
 *        c[i][j] = 0;
 *        for (k=0; k<N; k++)
 *          c[i][j] = c[i][j] + (a[i][k] * b[k][j]);
 *      }
 *    }
 *  }
 *
 * Because the purpose of this example is to illustrate how to program
 * in CRL, the example is intentionally kept quite simple (e.g., the
 * two input matrices are hard coded; the particular work and data
 * decomposition used is not particularly efficient).
 */

/* We begin by #including "crl.h", #defining some constants, and
 * declaring the two input matrices.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#if defined(CM5)
#include <cm/cmmd.h>
#elif defined(ALEWIFE)
#include <parallel.h>
#endif

#include "crl.h"

/* MASTER is the node responsible for creating the regions
 * used to represent the destination matrix (c)
 */
#define MASTER (0)

/* N is the size of the (square) matrices used
 */
#define N (5)

/* IntMatrix is a typedef for an N by N matrix of int elements
 */
typedef int IntMatrix[N][N];

/* RidMatrix is a typedef for an N by N matrix of rid_t elements
 * (an rid_t is a CRL region identifier)
 */
typedef rid_t RidMatrix[N][N];

/* the first input matrix (static; replicated on all processors)
 */
IntMatrix a = {{ 2,  0,  0,  0,  0},
               { 0,  2,  0,  0,  0},
               { 0,  0,  2,  0,  0},
               { 0,  0,  0,  2,  0},
               { 0,  0,  0,  0,  2}};

/* the second input matrix (static; replicated on all processors) 
 */
IntMatrix b = {{ 1,  2,  3,  4,  5},
               { 6,  7,  8,  9, 10},
               {11, 12, 13, 14, 15},
               {16, 17, 18, 19, 20},
               {21, 22, 23, 24, 25}};

/* Now we need to consider the question of how we divide the work
 * involved in matrix multiplication. Basically, we want the program
 * to proceed as follows:
 *
 * 1. The MASTER node creates the regions used to represent the
 *    destination matrix (c).
 *
 * 2. Each node does its share of the computation and stops at a
 *    barrier when done.
 *
 * 3. After nodes return from the barrier, all work is done. The
 *    MASTER node prints out the values found in the destination
 *    matrix and all nodes exit.
 *
 * To implement this, we create a matrix of region identifiers, each
 * of which will in turn refer to a region which contains the value
 * of that element of the destination matrix (c).
 */
RidMatrix c;

/* We divide work in this program so that each node is called upon to
 * perform approximately the same amount of work. The scheme which is
 * used is fairly simple: we color each square in the matrix with a
 * node index number, traversing the n by n matrix from left to right,
 * and top to bottom, counting off node indices in ascending order. For
 * example, with seven nodes and a 4 by 4 matrix, assignments would
 * look like:
 *
 *  0  1  2  3
 *  4  5  6  0
 *  1  2  3  4
 *  5  6  0  1
 */

/* If we're using the TCP/UNIX version of CRL, the main procedure can
 * be found in ../pvm.common/pvm_app.c. This procedure takes care of
 * setting up a pvm group, and spawning processes. It then calls
 * main2(). Please see this code for more information on how to set up
 * a program to run using pvm.
 *
 * If we're running CRL on any other platform, then we need to include
 * a main procedure that calls main2().
 */

extern int main2(int, char **);

#if defined(TCPUNIX)
extern char *GROUP;
#else
int main(int argc, char **argv)
{
#if defined(CM5)
  CMMD_set_io_mode(0, CMMD_independent);
  CMMD_set_io_mode(1, CMMD_independent);
  CMMD_set_io_mode(2, CMMD_independent);
#endif

#if defined(ALEWIFE)
  do_in_parallel(main2, argc, argv);
#else 
  main2(argc, argv);
#endif

  return 0;
}
#endif

int main2(int argc, char **argv)
{
  int  index;
  int  row, col;
  int  k;
  int *tmp;

  /* initialize CRL
   */
#if defined(TCPUNIX)
  crl_init(GROUP);
#else
  crl_init();
#endif

  if (crl_self_addr == MASTER)
  {
    /* on the MASTER node, create all the regions used to represent
     * the destination matrix
     */
    for (row=0; row<N; row++)
      for (col=0; col<N; col++)
        c[row][col] = rgn_create(sizeof(int));

    /* broadcast this set of region identifiers to all other nodes
     */
    rgn_bcast_send(sizeof(RidMatrix), c);
  }
  else
  {
    /* receive the set of region identifiers broadcast by the MASTER
     * node
     */
    rgn_bcast_recv(sizeof(RidMatrix), c);
  }

  /* work distribution algorithm:
   *
   * Each node then computes the value of all destination matrix
   * elements where the index of the element in the matrix (numbering
   * from left to right, top to bottom) modulo the number of nodes is
   * equal to the local node index.
   *
   * that is, if
   *
   *  index = (N * row) + col
   *
   * each node computes all destination matrix elements for which
   *
   *  (index % crl_num_nodes) == crl_self_addr
   */

  for (index=crl_self_addr; index<(N*N); index+=crl_num_nodes)
  {
    row = index / N;
    col = index % N;

    printf("node %d computing c[%d][%d]\n", crl_self_addr, row, col);

    /* now, map the region representing the current destination matrix
     * element and initiate a write so we can write to it
     */
    tmp = rgn_map(c[row][col]);
    rgn_start_write(tmp);

    /* compute the value of this element of the destination matrix
     */
    *tmp = 0;
    for (k=0; k<N; k++)
      *tmp += (a[row][k] * b[k][col]);

    /* since we're done with this element of the destination matrix,
     * terminate the write operation and unmap the region
     */
    rgn_end_write(tmp);
    rgn_unmap(tmp);
  }

  /* wait for all nodes to finish computing
   */
  rgn_barrier();
  
  /* MASTER node prints out the destination matrix
   */
  if (crl_self_addr == MASTER)
  {
    printf("Final matrix:\n");
    for (row=0; row<N; row++)
    {
      for (col=0; col<N; col++)
      {
        /* map the region representing the current element and
         * initiate a read operation
         */
        tmp = (int *) rgn_map(c[row][col]);
        rgn_start_read(tmp);

        printf("%6d ", *tmp);

        /* terminate the read operation and unmap the region
         */
        rgn_end_read(tmp);
        rgn_unmap(tmp);
      }

      printf("\n");
    }
  }

#if defined(TCPUNIX)
  crl_exit();
#endif
}
