// Program : Multiple Writer Protocol Version of Matrix Multiplication
// Author  : William W. Y. Liang
// Usage   : matmul2 [[-]size]

#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include "adsm.h"
#include "adsmutil.h"
#include "adsmtime.h"

// Routine executed by every spawned child; its definition follows main().
void slave();

// Edge length of the square matrices. Stays 20 unless the parent overrides
// it from argv[1]; children overwrite it with the value read from "snum".
int num=20;
// Sequence number of this process (main() treats 0 as the parent).
int seqno;
// Host count reported by get_nhost(); also the number of children spawned.
int nhost;
// Barrier used by the parent and the children, always with nhost+1 parties.
AdsmBarrier Bmatmul2("matmul2");

// Offset of element (r, c) inside a flat array that stores a matrix with
// `num` entries per row, one row after another.
inline int index(int r,int c) {
  const int rowStart=num*r; // offset of the first element of row r
  return rowStart+c;
}

// shared variables: each pointer is bound to a named object via adsm_malloc
int *snum; // one int, used by the parent to hand the matrix size to children
int *a;    // left operand, num*num ints
int *b;    // right operand, num*num ints
int *c;    // product, num*num ints

int main(int argc,char *argv[]) {
  // to obtain my sequence number
  seqno=get_seqno();
  nhost=get_nhost();

  // to pass size of array
  snum=(int*)adsm_malloc("snum",sizeof(int));

  // spawn child
  if (seqno==0) { // parent do this
    int show=0;
    if (argc>1) {
      num=atoi(argv[1]);
      if (num<0) {
        show=1;
        num=-num;
      }
    }

    // pass size of array
    *snum=num;
    adsm_flush(snum);

    cout<<nhost<<" host detected"<<endl;
    if (seqno==0) adsm_spawn(execname(argv[0]),nhost); 

    Bmatmul2.barrier(nhost+1);
    Timing Tmatmul2("matmul2");
    Bmatmul2.barrier(nhost+1);
    Tmatmul2.start(); // timing start
    Bmatmul2.barrier(nhost+1);
    Tmatmul2.stop(); // timing start

    if (show) { // show result
      c=(int*)adsm_malloc("c",sizeof(int)*num*num,AdsmDataMWriter); 
      cout<<"Final matrix:"<<endl;
      adsm_refresh(c);
      for (int row=0; row<num; row++) {
        for (int col=0; col<num; col++)
          cout<<" "<<c[row*num+col];
        cout<<endl;
      }
    }
  } else { // child do jobs
    adsm_refresh(snum);
    num=*snum;
    slave();
  }
}

// Work done by each spawned child: fill a and b, then compute this child's
// share of c = a*b.
//
// The num*num cells of the flattened matrices are dealt out with a stride of
// nhost: this process handles cells seqno-1, seqno-1+nhost, seqno-1+2*nhost...
// The three barrier calls pair up, in order, with the three barrier calls
// the parent makes in main().
void slave() {
  // bind the shared matrices used below
  a=(int*)adsm_malloc("a",sizeof(int)*num*num,AdsmDataMWriter|AdsmDataUpdate);
  b=(int*)adsm_malloc("b",sizeof(int)*num*num,AdsmDataMWriter|AdsmDataUpdate);
  c=(int*)adsm_malloc("c",sizeof(int)*num*num,AdsmDataMWriter);

  const int first=seqno-1; // first flat cell owned by this process
  const int total=num*num; // number of cells in each matrix
  int cell;                // declared once, reused by both loops

  // fill this process's cells of a and b: 2 on the main diagonal, 0 elsewhere
  for (cell=first; cell<total; cell+=nhost) {
    const int fill=(cell/num==cell%num)?2:0;
    b[cell]=fill;
    a[cell]=fill;
  }
  adsm_flush(a);
  adsm_flush(b);

  Bmatmul2.barrier(nhost+1); // init done

  // collect multiple write result, both objects inside one bulk refresh
  adsm_refresh(AdsmBulkBegin);
  adsm_refresh(a);
  adsm_refresh(b);
  adsm_refresh(AdsmBulkEnd);

  Bmatmul2.barrier(nhost+1); // to compute

  // compute, statically partitioned: one row-by-column product per owned cell
  for (cell=first; cell<total; cell+=nhost) {
    const int row=cell/num;
    const int col=cell%num;
    int &dot=c[cell]; // accumulate directly in the shared cell
    dot=0;
    for (int k=0; k<num; k++)
      dot+=a[index(row,k)]*b[index(k,col)];
  }
  adsm_flush(c); // note: multiple writer protocol, only write part of c

  Bmatmul2.barrier(nhost+1); // computation done and to collect write result
}
