// Program : Matrix Multiplication, fine-grain access with bulk transfer
// Author  : William W. Y. Liang
// Usage   : matmul [[-]size]
//           A leading '-' on size causes the result to be printed out.
// Last Update : 12/05/96

#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include "adsm.h"
#include "adsmutil.h"
#include "adsmtime.h"

// function prototypes
// slave()  : run by every spawned child (seqno != 0); computes its share of
//            the rows of c.
// master() : run by the parent (seqno == 0); creates a and b, spawns the
//            children, times the run and optionally prints c.
void slave();
void master();

// sys info, queried once before main() runs
// seqno : this process's sequence number; main() treats 0 as the parent
// nhost : host count; used as the number of children to spawn and to
//         partition the rows among them
int seqno=get_seqno();
int nhost=get_nhost();

// Synchronization
// Barrier named "matmul"; every use in this file waits for nhost+1
// participants (presumably the nhost children plus the parent).
AdsmBarrier Bmatmul("matmul");

// local data
// prog : argv[0], set in main() and used by master() to spawn the children
// num  : matrix dimension (default 20); children read it from shared "snum"
// show : nonzero when the size argument was negative; makes master() print c
char *prog;
int num=20;
int show=0;

// Entry point shared by the parent and the spawned children.
// The process with seqno 0 parses the optional size argument, shares it
// under the name "snum" and runs master(); every other process reads the
// size back from "snum" and runs slave().
int main(int argc,char *argv[]) {
  // Remember the executable name; master() uses it to spawn the children.
  prog=argv[0];

  if (seqno!=0) {
    // Child: pick up the matrix size shared by the parent, then do the work.
    int *shared_size=(int*)adsm_malloc("snum",sizeof(int));
    adsm_refresh(shared_size);
    num=*shared_size;

    slave();
    return 0;
  }

  // Parent: an optional argument overrides the default size; a negative
  // value additionally requests that the result be printed.
  if (argc>1) {
    num=atoi(argv[1]);
  }
  show=(num<0);
  if (show) {
    num=-num;
  }

  cout<<nhost<<" host detected"<<endl;

  // Share the size with the children (presumably &num supplies the initial
  // contents of "snum" -- confirm against the ADSM library).
  adsm_malloc("snum",sizeof(int),&num);

  master();
  return 0;
}

// Parent-side driver: creates the shared matrices a and b filled with 1's,
// spawns nhost children, meets them at the three "matmul" barriers to time
// the run, and, when show is set, fetches and prints the result matrix c.
// Takes no arguments; reads the globals num, nhost, prog and show.
void master() {
  // set initial vector (all 1's); passed as the row contents for a and b
  int *initmp=new int[num],i;
  for (i=0; i<num; i++) initmp[i]=1;

  // initialize shared objects
  // Each call is handed the full row count (num), exactly like the single
  // calls in slave() and for c below, so one call per matrix is enough.
  // The calls used to sit inside a num-iteration loop whose body never
  // used the loop index.
  // NOTE(review): assumes the repeated calls were redundant -- confirm
  // against the ADSM library.
  // The num>0 guard keeps the old behavior of making no call at all for an
  // empty matrix.
  int **a=new int*[num],**b=new int*[num];
  if (num>0) {
    adsm_malloc_array("a",sizeof(int)*num,0,num,a,initmp); // row major
    adsm_malloc_array("b",sizeof(int)*num,0,num,b,initmp); // col major
  }

  // spawn children
  adsm_spawn(execname(prog),nhost); 

  // timing
  // The three barriers pair with the three in slave(): before prefetch,
  // before compute, and after compute. Tmatmul1 therefore spans prefetch
  // plus compute, Tmatmul2 only the compute phase.
  Bmatmul.barrier(nhost+1);
  Timing Tmatmul1("Including Prefetch");
  Timing Tmatmul2("Computing and Refresh");
  Tmatmul1.start(); // timing start
  Bmatmul.barrier(nhost+1);
  Tmatmul2.start(); // timing start
  Bmatmul.barrier(nhost+1);
  Tmatmul2.stop();  // timing end
  Tmatmul1.stop();  // timing end

  if (show) {
    cout<<"Final matrix:"<<endl;

    // allocate c (same name and row size as the children use)
    int **c=new int*[num];
    adsm_malloc_array("c",sizeof(int)*num,num,c);

    // fetch every row of c as one bulk operation before reading it
    adsm_refresh(AdsmBulkBegin);
    for (i=0; i<num; i++) adsm_refresh(c[i]);
    adsm_refresh(AdsmBulkEnd);

    for (int row=0; row<num; row++) {
      for (int col=0; col<num; col++) cout<<" "<<c[row][col];
      cout<<endl;
    }
  }
}

// Child-side worker: computes rows [begin,end) of c = a * b and flushes each
// finished row. b is stored column major (b[j] is column j), so both
// operands are walked along a row of pointers.
// Takes no arguments; reads the globals num, nhost and seqno. Children are
// numbered from seqno 1.
void slave() {
  int i,j,k;

  // task partition: ceil(num/nhost) consecutive rows per child
  int len=(num+nhost-1)/nhost;
  int begin=len*(seqno-1);
  // When num is not a multiple of the block size, a trailing child can start
  // beyond the last row (e.g. num=5, nhost=4, seqno=4 gives begin=6).
  // Clamp it so that child gets an empty range; otherwise len would go
  // negative and a+begin would point past the end of the pointer array.
  if (begin>num) begin=num;
  int end=begin+len;
  if (end>num) end=num;
  len=end-begin;

  cout<<seqno<<": my range is "<<begin<<" to "<<end<<endl;

  // allocate: attach to the shared matrices by name
  int **a=new int*[num],**b=new int*[num],**c=new int*[num];
  adsm_malloc_array("a",sizeof(int)*num,num,a);
  adsm_malloc_array("b",sizeof(int)*num,num,b);
  adsm_malloc_array("c",sizeof(int)*num,num,c);

  Bmatmul.barrier(nhost+1); // to prefetch

  // bulk prefetch: only this child's rows of a, but all of b
  adsm_prefresh(AdsmBulkBegin);
  adsm_prefresh_array(a+begin,len);
  adsm_prefresh_array(b,num);
  adsm_prefresh(AdsmBulkEnd);

  Bmatmul.barrier(nhost+1); // to compute

  for (i=begin; i<end; i++) {
    adsm_refresh(a[i]); // ensure the arrival of a[i]
    for (j=0; j<num; j++) {
      adsm_refresh(b[j]); // ensure the arrival of b[j]
      c[i][j]=0;
      for (k=0; k<num; k++)
        c[i][j]+=a[i][k]*b[j][k]; // b is allocated as column major
    }
    adsm_flush(c[i]); // flush the result of c[i]
  }
  
  // Even a child with an empty range must reach every barrier, because the
  // parent and the other children wait for nhost+1 participants.
  Bmatmul.barrier(nhost+1); // finish computing
}
