#ifdef PETSC_RCS_HEADER
static char vcid[] = "$Id: mpirowbs.c,v 1.99.1.84 1998/04/27 03:52:57 curfman Exp $";
#endif

#if defined(HAVE_BLOCKSOLVE) && !defined(USE_PETSC_COMPLEX)
#include "pinclude/pviewer.h"
#include "src/mat/impls/rowbs/mpi/mpirowbs.h"
#include "src/vec/vecimpl.h"

#define CHUNCKSIZE_LOCAL   10


#undef __FUNC__  
#define __FUNC__ "MatFreeRowbs_Private"
static int MatFreeRowbs_Private(Mat A,int n,int *i,Scalar *v)
{
  PetscFunctionBegin;
  if (v) {
#if defined(USE_PETSC_LOG)
    int len = -n*(sizeof(int)+sizeof(Scalar));
#endif
    PetscFree(v);
    PLogObjectMemory(A,len);
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatMallocRowbs_Private"
static int MatMallocRowbs_Private(Mat A,int n,int **i,Scalar **v)
{
  int len;

  PetscFunctionBegin;
  if (n == 0) {
    *i = 0; *v = 0;
  } else {
    len = n*(sizeof(int) + sizeof(Scalar));
    *v = (Scalar *) PetscMalloc(len); CHKPTRQ(*v);
    PLogObjectMemory(A,len);
    *i = (int *)(*v + n);
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatScale_MPIRowbs"
int MatScale_MPIRowbs(Scalar *alphain,Mat inA)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) inA->data;
  BSspmat      *A = a->A;
  BSsprow      *vs;
  Scalar       *ap,alpha = *alphain;
  int          i,m = a->m,nrow,j;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
    vs   = A->rows[i];
    nrow = vs->length;
    ap   = vs->nz;
    for ( j=0; j<nrow; j++ ) {
      ap[j] *= alpha;
    }
  }
  PLogFlops(a->nz);
  PetscFunctionReturn(0);
}

/* ----------------------------------------------------------------- */
#undef __FUNC__  
#define __FUNC__ "MatCreateMPIRowbs_local"
static int MatCreateMPIRowbs_local(Mat A,int nz,int *nnz)
{
  Mat_MPIRowbs *bsif = (Mat_MPIRowbs *) A->data;
  int          ierr, i, len, nzalloc = 0, m = bsif->m;
  BSspmat      *bsmat;
  BSsprow      *vs;

  PetscFunctionBegin;
  if (!nnz) {
    if (nz == PETSC_DEFAULT) nz = 5;
    if (nz <= 0)             nz = 1;
    nzalloc = 1;
    nnz     = (int *) PetscMalloc( (m+1)*sizeof(int) ); CHKPTRQ(nnz);
    for ( i=0; i<m; i++ ) nnz[i] = nz;
    nz      = nz*m;
  } else {
    nz = 0;
    for ( i=0; i<m; i++ ) {
      if (nnz[i] <= 0) nnz[i] = 1;
      nz += nnz[i];
    }
  }

  /* Allocate BlockSolve matrix context */
  bsif->A                = bsmat = PetscNew(BSspmat); CHKPTRQ(bsmat);
  BSset_mat_icc_storage(bsmat,PETSC_FALSE);
  BSset_mat_symmetric(bsmat,PETSC_FALSE);
  len                    = m*(sizeof(BSsprow *) + sizeof(BSsprow)) + 1;
  bsmat->rows            = (BSsprow **) PetscMalloc( len ); CHKPTRQ(bsmat->rows);
  bsmat->num_rows        = m;
  bsmat->global_num_rows = bsif->M;
  bsmat->map             = bsif->bsmap;
  vs                     = (BSsprow *) (bsmat->rows + m);
  for (i=0; i<m; i++) {
    bsmat->rows[i]  = vs;
    bsif->imax[i]   = nnz[i];
    vs->diag_ind    = -1;
    if (nnz[i] > 0) {
      ierr = MatMallocRowbs_Private(A,nnz[i],&(vs->col),&(vs->nz));CHKERRQ(ierr);
    } else {
      vs->col = 0; vs->nz = 0;
    }
    /* put zero on diagonal */
    /*vs->length	    = 1;
    vs->col[0]      = i + bsif->rstart;
    vs->nz[0]       = 0.0;*/
    vs->length = 0;
    vs++; 
  }
  PLogObjectMemory(A,sizeof(BSspmat) + len);
  bsif->nz	     = 0;
  bsif->maxnz	     = nz;
  bsif->sorted       = 0;
  bsif->roworiented  = 1;
  bsif->nonew        = 0;

  if (nzalloc) PetscFree(nnz);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatSetValues_MPIRowbs_local"
static int MatSetValues_MPIRowbs_local(Mat AA,int m,int *im,int n,int *in,Scalar *v,
                                       InsertMode addv)
{
  Mat_MPIRowbs *mat = (Mat_MPIRowbs *) AA->data;
  BSspmat      *A = mat->A;
  BSsprow      *vs;
  int          *rp,k,a,b,t,ii,row,nrow,i,col,l,rmax, ierr;
  int          *imax = mat->imax, nonew = mat->nonew, sorted = mat->sorted;
  Scalar       *ap, value;

  PetscFunctionBegin;
  for ( k=0; k<m; k++ ) { /* loop over added rows */
    row = im[k];
    if (row < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (row >= mat->m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
    vs   = A->rows[row];
    ap   = vs->nz; rp = vs->col;
    rmax = imax[row]; nrow = vs->length;
    a    = 0;
    for ( l=0; l<n; l++ ) { /* loop over added columns */
      if (in[l] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative col");
      if (in[l] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
      col = in[l]; value = *v++;
      if (!sorted) a = 0; b = nrow;
      while (b-a > 5) {
        t = (b+a)/2;
        if (rp[t] > col) b = t;
        else             a = t;
      }
      for ( i=a; i<b; i++ ) {
        if (rp[i] > col) break;
        if (rp[i] == col) {
          if (addv == ADD_VALUES) ap[i] += value;
          else                    ap[i] = value;
          goto noinsert;
        }
      }
      if (nonew) goto noinsert;
      if (nrow >= rmax) {
        /* there is no extra room in row, therefore enlarge */
        int      *itemp;
        register int *iout, *iin = vs->col;
        register Scalar *vout, *vin = vs->nz;
        Scalar   *vtemp;

        /* malloc new storage space */
        imax[row] += CHUNCKSIZE_LOCAL;
        ierr = MatMallocRowbs_Private(AA,imax[row],&itemp,&vtemp);CHKERRQ(ierr);
        vout = vtemp; iout = itemp;
        for (ii=0; ii<i; ii++) {
          vout[ii] = vin[ii];
          iout[ii] = iin[ii];
        }
        vout[i] = value;
        iout[i] = col;
        for (ii=i+1; ii<=nrow; ii++) {
          vout[ii] = vin[ii-1];
          iout[ii] = iin[ii-1];
        }
        /* free old row storage */
        if (rmax > 0) {
          ierr = MatFreeRowbs_Private(AA,rmax,vs->col,vs->nz); CHKERRQ(ierr);
        }
        vs->col           =  iout; vs->nz = vout;
        rmax              =  imax[row];
        mat->maxnz        += CHUNCKSIZE_LOCAL;
        mat->reallocs++;
      } else {
        /* shift higher columns over to make room for newie */
        for ( ii=nrow-1; ii>=i; ii-- ) {
          rp[ii+1] = rp[ii];
          ap[ii+1] = ap[ii];
        }
        rp[i] = col;
        ap[i] = value;
      }
      nrow++;
      mat->nz++;
      AA->same_nonzero = PETSC_FALSE;
      noinsert:;
      a = i + 1;
    }
    vs->length = nrow;
  }
  PetscFunctionReturn(0);
}


#undef __FUNC__  
#define __FUNC__ "MatAssemblyBegin_MPIRowbs_local"
static int MatAssemblyBegin_MPIRowbs_local(Mat A,MatAssemblyType mode)
{ 
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatAssemblyEnd_MPIRowbs_local"
static int MatAssemblyEnd_MPIRowbs_local(Mat AA,MatAssemblyType mode)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) AA->data;
  BSspmat      *A = a->A;
  BSsprow      *vs;
  int          i, j, rstart = a->rstart;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);

  /* Mark location of diagonal */
  for ( i=0; i<a->m; i++ ) {
    vs = A->rows[i];
    for ( j=0; j<vs->length; j++ ) {
      if (vs->col[j] == i + rstart) {
        vs->diag_ind = j;
        break;
      }
    }
    if (vs->diag_ind == -1) { 
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"no diagonal entry");
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatZeroRows_MPIRowbs_local"
static int MatZeroRows_MPIRowbs_local(Mat A,IS is,Scalar *diag)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) A->data;
  BSspmat      *l = a->A;
  int          i, ierr, N, *rz, m = a->m - 1;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rz); CHKERRQ(ierr);
  if (diag) {
    for ( i=0; i<N; i++ ) {
      if (rz[i] < 0 || rz[i] > m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Out of range");
      if (l->rows[rz[i]]->length > 0) { /* in case row was completely empty */
        l->rows[rz[i]]->length = 1;
        l->rows[rz[i]]->nz[0]  = *diag;
        l->rows[rz[i]]->col[0] = a->rstart + rz[i];
      } else {
        ierr = MatSetValues(A,1,&rz[i],1,&rz[i],diag,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
  } else {
    for ( i=0; i<N; i++ ) {
      if (rz[i] < 0 || rz[i] > m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Out of range");
      l->rows[rz[i]]->length = 0;
    }
  }
  A->same_nonzero = PETSC_FALSE;
  ISRestoreIndices(is,&rz);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatNorm_MPIRowbs_local"
static int MatNorm_MPIRowbs_local(Mat A,NormType type,double *norm)
{
  Mat_MPIRowbs *mat = (Mat_MPIRowbs *) A->data;
  BSsprow      *vs, **rs;
  Scalar       *xv;
  double       sum = 0.0;
  int          *xi, nz, i, j;

  PetscFunctionBegin;
  rs = mat->A->rows;
  if (type == NORM_FROBENIUS) {
    for (i=0; i<mat->m; i++ ) {
      vs = *rs++;
      nz = vs->length;
      xv = vs->nz;
      while (nz--) {
#if defined(USE_PETSC_COMPLEX)
        sum += real(conj(*xv)*(*xv)); xv++;
#else
        sum += (*xv)*(*xv); xv++;
#endif
      }
    }
    *norm = sqrt(sum);
  } else if (type == NORM_1) { /* max column norm */
    double *tmp;
    tmp = (double *) PetscMalloc( mat->n*sizeof(double) ); CHKPTRQ(tmp);
    PetscMemzero(tmp,mat->n*sizeof(double));
    *norm = 0.0;
    for (i=0; i<mat->m; i++) {
      vs = *rs++;
      nz = vs->length;
      xi = vs->col;
      xv = vs->nz;
      while (nz--) {
        tmp[*xi] += PetscAbsScalar(*xv); 
        xi++; xv++;
      }
    }
    for ( j=0; j<mat->n; j++ ) {
      if (tmp[j] > *norm) *norm = tmp[j];
    }
    PetscFree(tmp);
  } else if (type == NORM_INFINITY) { /* max row norm */
    *norm = 0.0;
    for ( i=0; i<mat->m; i++ ) {
      vs = *rs++;
      nz = vs->length;
      xv = vs->nz;
      sum = 0.0;
      while (nz--) {
        sum += PetscAbsScalar(*xv); xv++;
      }
      if (sum > *norm) *norm = sum;
    }
  } else {
    SETERRQ(PETSC_ERR_SUP,0,"No support for the two norm");
  }
  PetscFunctionReturn(0);
}

/* ----------------------------------------------------------------- */

#undef __FUNC__  
#define __FUNC__ "MatSetValues_MPIRowbs"
int MatSetValues_MPIRowbs(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode av)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) A->data;
  int          ierr, i, j, row, col, rstart = a->rstart, rend = a->rend;
  int          roworiented = a->roworiented;

  PetscFunctionBegin;
  /* Note:  There's no need to "unscale" the matrix, since scaling is
     confined to a->pA, and we're working with a->A here */
  for ( i=0; i<m; i++ ) {
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= a->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
    if (im[i] >= rstart && im[i] < rend) {
      row = im[i] - rstart;
      for ( j=0; j<n; j++ ) {
        if (in[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
        if (in[j] >= a->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
        if (in[j] >= 0 && in[j] < a->N){
          col = in[j];
          if (roworiented) {
            ierr = MatSetValues_MPIRowbs_local(A,1,&row,1,&col,v+i*n+j,av);CHKERRQ(ierr);
          } else {
            ierr = MatSetValues_MPIRowbs_local(A,1,&row,1,&col,v+i+j*m,av);CHKERRQ(ierr);
          }
        } else {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid column");}
      }
    } else {
      if (roworiented) {
        ierr = StashValues_Private(&a->stash,im[i],n,in,v+i*n,av);CHKERRQ(ierr);
      } else {
        row = im[i];
        for ( j=0; j<n; j++ ) {
          ierr = StashValues_Private(&a->stash,row,1,in+j,v+i+j*m,av);CHKERRQ(ierr);
        }
      }
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatAssemblyBegin_MPIRowbs"
int MatAssemblyBegin_MPIRowbs(Mat mat,MatAssemblyType mode)
{ 
  Mat_MPIRowbs  *a = (Mat_MPIRowbs *) mat->data;
  MPI_Comm      comm = mat->comm;
  int           size = a->size, *owners = a->rowners,st,rank = a->rank;
  int           *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  int           tag = mat->tag, *owner,*starts,count,ierr,sn;
  MPI_Request   *send_waits,*recv_waits;
  InsertMode    addv;
  Scalar        *rvalues,*svalues;

  PetscFunctionBegin;
  /* StashInfo_Private(&a->stash); */
  /* Note:  There's no need to "unscale" the matrix, since scaling is
            confined to a->pA, and we're working with a->A here */

  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some procs inserted; others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (a->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<a->stash.n; i++ ) {
    idx = a->stash.idx[i];
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];} 

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs,work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank]; 
  ierr      = MPI_Allreduce(nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives: 
       1) each message will consist of ordered pairs 
     (global index,value) we store the global index as a double 
     to simplify the message passing. 
       2) since we don't know how long each individual message is we 
     allocate the largest needed buffer for each receive. Potentially 
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to 
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc( 3*(a->stash.n+1)*sizeof(Scalar) );CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0; 
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  sn = a->stash.n;
  for ( i=0; i<sn; i++ ) {
    st            = 3*starts[owner[i]]++;
    svalues[st++] = (Scalar)  a->stash.idx[i];
    svalues[st++] = (Scalar)  a->stash.idy[i];
    svalues[st]   =  a->stash.array[i];
  }
  PetscFree(owner);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(mat,"MatAssemblyBegin_MPIRowbs:Number of off-processor values %d\n",a->stash.n);
  ierr = StashDestroy_Private(&a->stash); CHKERRQ(ierr);

  a->svalues    = svalues;    a->rvalues = rvalues;
  a->nsends     = nsends;     a->nrecvs = nreceives;
  a->send_waits = send_waits; a->recv_waits = recv_waits;
  a->rmax       = nmax;

  PetscFunctionReturn(0);
}

#include "viewer.h"
#include "sys.h"

#undef __FUNC__  
#define __FUNC__ "MatView_MPIRowbs_ASCII_Base_Private"
static int MatView_MPIRowbs_ASCII_Base_Private(Mat_MPIRowbs *a,FILE *fd,int format)
{
  BSspmat *A = a->A;
  BSsprow **rs = A->rows;
  int     i, j;

  PetscFunctionBegin;
  if (format == VIEWER_FORMAT_ASCII_COMMON) {
    for ( i=0; i<A->num_rows; i++ ) {
      fprintf(fd,"row %d:",i+a->rstart);
      for (j=0; j<rs[i]->length; j++) {
        if (rs[i]->nz[j]) fprintf(fd," %d %g ", rs[i]->col[j], rs[i]->nz[j]);
      }
      fprintf(fd,"\n");
    }
  } else {
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
            a->rank,a->m,a->rstart,a->rend,a->n,0,a->N);
    for ( i=0; i<A->num_rows; i++ ) {
      fprintf(fd,"row %d:",i+a->rstart);
      for (j=0; j<rs[i]->length; j++) {
        fprintf(fd," %d %g ", rs[i]->col[j], rs[i]->nz[j]);
      }
      fprintf(fd,"\n");
    }
  }
  fflush(fd);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatView_MPIRowbs_ASCII"
static int MatView_MPIRowbs_ASCII(Mat mat,Viewer viewer)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  int          ierr, format, i, rank, size, row;
  FILE         *fd;

  PetscFunctionBegin;
  ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
  ierr = ViewerGetFormat(viewer,&format); CHKERRQ(ierr);

  if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
    int ind_l, ind_g, clq_l, clq_g, color;
    ind_l = BSlocal_num_inodes(a->pA); CHKERRBS(0);
    ind_g = BSglobal_num_inodes(a->pA); CHKERRBS(0);
    clq_l = BSlocal_num_cliques(a->pA); CHKERRBS(0);
    clq_g = BSglobal_num_cliques(a->pA); CHKERRBS(0);
    color = BSnum_colors(a->pA); CHKERRBS(0);
    PetscFPrintf(mat->comm,fd,
     "  %d global inode(s), %d global clique(s), %d color(s)\n",ind_g,clq_g,color);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"    [%d] %d local inode(s), %d local clique(s)\n",a->rank,ind_l,clq_l);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  } else {
    ViewerType   vtype;
    ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
    if (vtype == ASCII_FILE_VIEWER) {
      PetscSequentialPhaseBegin(mat->comm,1);
      ierr = MatView_MPIRowbs_ASCII_Base_Private(a,fd,format); CHKERRQ(ierr);
      PetscSequentialPhaseEnd(mat->comm,1);
    } else {
      size = a->size; rank = a->rank;
      if (size == 1) {
        ierr = MatView_MPIRowbs_ASCII_Base_Private(a,fd,format); CHKERRQ(ierr);
      } else { /* Assemble the entire matrix onto first processor */
        Mat mat2;
        BSspmat *A = a->A;
        BSsprow **rs = A->rows;
        int     M = a->M, m = a->m;
        if (!rank) {
          ierr = MatCreateMPIRowbs(mat->comm,M,M,0,PETSC_NULL,PETSC_NULL,&mat2); CHKERRQ(ierr);
        } else {
          ierr = MatCreateMPIRowbs(mat->comm,0,M,0,PETSC_NULL,PETSC_NULL,&mat2); CHKERRQ(ierr);
        }
        PLogObjectParent(mat,mat2);
        A = a->A; rs = A->rows;
        if (a->mat_is_symmetric) {
          ierr = MatSetOption(mat2,MAT_SYMMETRIC); CHKERRQ(ierr);
        } else if (a->mat_is_structurally_symmetric) {
          ierr = MatSetOption(mat2,MAT_STRUCTURALLY_SYMMETRIC); CHKERRQ(ierr);
        }
        row = a->rstart;
        for ( i=0; i<m; i++ ) {
	  ierr = MatSetValues(mat2,1,&row,rs[i]->length,rs[i]->col,rs[i]->nz,
                 INSERT_VALUES); CHKERRQ(ierr);
          row++;
        }
        /* Note that we do only flush assembly since otherwise BlockSolve chokes,
           as it doesn't support matrices of dimension 0 */
        ierr = MatAssemblyBegin(mat2,MAT_FLUSH_ASSEMBLY); CHKERRQ(ierr);
        ierr = MatAssemblyEnd(mat2,MAT_FLUSH_ASSEMBLY); CHKERRQ(ierr);
        if (!rank) {
          ierr = MatView_MPIRowbs_ASCII_Base_Private((Mat_MPIRowbs*)(mat2->data),
                 fd,format); CHKERRQ(ierr);
        }
        ierr = MatDestroy(mat2); CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatView_MPIRowbs_Binary"
static int MatView_MPIRowbs_Binary(Mat mat,Viewer viewer)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  int          ierr,i,M,m,rank,size,*sbuff,*rowlengths;
  int          *recvcts,*recvdisp,fd,*cols,maxnz,nz,j;
  BSspmat      *A = a->A;
  BSsprow      **rs = A->rows;
  MPI_Comm     comm = mat->comm;
  MPI_Status   status;
  Scalar       *vals;
  MatInfo      info;

  PetscFunctionBegin;
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);

  M = a->M; m = a->m;
  /* First gather together on the first processor the lengths of 
     each row, and write them out to the file */
  sbuff = (int*) PetscMalloc( m*sizeof(int) ); CHKPTRQ(sbuff);
  for ( i=0; i<A->num_rows; i++ ) {
    sbuff[i] = rs[i]->length;
  }
  ierr = MatGetInfo(mat,MAT_GLOBAL_SUM,&info); CHKERRQ(ierr);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    rowlengths = (int*) PetscMalloc( (4+M)*sizeof(int) ); CHKPTRQ(rowlengths);
    recvcts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(recvcts);
    recvdisp = a->rowners;
    for ( i=0; i<size; i++ ) {
      recvcts[i] = recvdisp[i+1] - recvdisp[i];
    }
    /* first four elements of rowlength are the header */
    rowlengths[0] = mat->cookie;
    rowlengths[1] = a->M;
    rowlengths[2] = a->N;
    rowlengths[3] = (int)info.nz_used;
    ierr = MPI_Gatherv(sbuff,m,MPI_INT,rowlengths+4,recvcts,recvdisp,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sbuff);
    ierr = PetscBinaryWrite(fd,rowlengths,4+M,PETSC_INT,0); CHKERRQ(ierr);
    /* count the number of nonzeros on each processor */
    PetscMemzero(recvcts,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=recvdisp[i]; j<recvdisp[i+1]; j++ ) {
        recvcts[i] += rowlengths[j+3];
      }
    }
    /* allocate buffer long enough to hold largest one */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,recvcts[i]);
    }
    PetscFree(rowlengths); PetscFree(recvcts);
    cols = (int*) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* binary store column indices for 0th processor */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<rs[i]->length; j++) {
        cols[nz++] = rs[i]->col[j];
      }
    }
    ierr = PetscBinaryWrite(fd,cols,nz,PETSC_INT,0); CHKERRQ(ierr);

    /* receive and store column indices for all other processors */
    for ( i=1; i<size; i++ ) {
      /* should tell processor that I am now ready and to begin the send */
      ierr = MPI_Recv(cols,maxnz,MPI_INT,i,mat->tag,comm,&status);CHKERRQ(ierr);
      ierr = MPI_Get_count(&status,MPI_INT,&nz);CHKERRQ(ierr);
      ierr = PetscBinaryWrite(fd,cols,nz,PETSC_INT,0); CHKERRQ(ierr);
    }
    PetscFree(cols);
    vals = (Scalar*) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);

    /* binary store values for 0th processor */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<rs[i]->length; j++) {
        vals[nz++] = rs[i]->nz[j];
      }
    }
    ierr = PetscBinaryWrite(fd,vals,nz,PETSC_SCALAR,0); CHKERRQ(ierr);

    /* receive and store nonzeros for all other processors */
    for ( i=1; i<size; i++ ) {
      /* should tell processor that I am now ready and to begin the send */
      ierr = MPI_Recv(vals,maxnz,MPIU_SCALAR,i,mat->tag,comm,&status);CHKERRQ(ierr);
      ierr = MPI_Get_count(&status,MPIU_SCALAR,&nz);CHKERRQ(ierr);
      ierr = PetscBinaryWrite(fd,vals,nz,PETSC_SCALAR,0);CHKERRQ(ierr);
    }
    PetscFree(vals);
  } else {
    ierr = MPI_Gatherv(sbuff,m,MPI_INT,0,0,0,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sbuff);

    /* count local nonzeros */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<rs[i]->length; j++) {
        nz++;
      }
    }
    /* copy into buffer column indices */
    cols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(cols);
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<rs[i]->length; j++) {
        cols[nz++] = rs[i]->col[j];
      }
    }
    /* send */  /* should wait until processor zero tells me to go */
    ierr = MPI_Send(cols,nz,MPI_INT,0,mat->tag,comm);CHKERRQ(ierr);
    PetscFree(cols);

    /* copy into buffer column values */
    vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<rs[i]->length; j++) {
        vals[nz++] = rs[i]->nz[j];
      }
    }
    /* send */  /* should wait until processor zero tells me to go */
    ierr = MPI_Send(vals,nz,MPIU_SCALAR,0,mat->tag,comm);CHKERRQ(ierr);
    PetscFree(vals);
  }

  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatView_MPIRowbs"
int MatView_MPIRowbs(Mat mat,Viewer viewer)
{
  Mat_MPIRowbs *bsif = (Mat_MPIRowbs *) mat->data;
  ViewerType   vtype;
  int          ierr;

  PetscFunctionBegin;
  if (!bsif->blocksolveassembly) {
    ierr = MatAssemblyEnd_MPIRowbs_ForBlockSolve(mat); CHKERRQ(ierr);
  }
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER) {
    ierr = MatView_MPIRowbs_ASCII(mat,viewer);CHKERRQ(ierr);
  } else if (vtype == BINARY_FILE_VIEWER) {
    ierr = MatView_MPIRowbs_Binary(mat,viewer);CHKERRQ(ierr);
  } else {
    SETERRQ(1,1,"Viewer type not supported by PETSc object");
  }
  PetscFunctionReturn(0);
}
  
#undef __FUNC__  
#define __FUNC__ "MatAssemblyEnd_MPIRowbs_MakeSymmetric"
static int MatAssemblyEnd_MPIRowbs_MakeSymmetric(Mat mat)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  BSspmat      *A = a->A;
  BSsprow      *vs;
  int          size,rank,M,rstart,tag,i,j,*rtable,*w1,*w2,*w3,*w4,len,proc,nrqs;
  int          msz,*pa,bsz,nrqr,**rbuf1,**sbuf1,**ptr,*tmp,*ctr,col,index,row;
  int          ctr_j,*sbuf1_j,k,ierr;
  Scalar       val=0.0;
  MPI_Comm     comm;
  MPI_Request  *s_waits1,*r_waits1;
  MPI_Status   *s_status,*r_status;

  PetscFunctionBegin;
  comm   = mat->comm;
  tag    = mat->tag;
  size   = a->size;
  rank   = a->rank;
  M      = a->M;
  rstart = a->rstart;

  rtable = (int *) PetscMalloc(M*sizeof(int)); CHKPTRQ(rtable);
  /* Create hash table for the mapping :row -> proc */
  for (i=0, j=0; i<size; i++) {
    len = a->rowners[i+1];  
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them. */
  w1   = (int *) PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1);/*  mesg size */
  w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
  w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
  w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector */

  for ( i=0;  i<a->m; i++ ) { 
    PetscMemzero(w4,size*sizeof(int)); /* initialize work vector */
    vs = A->rows[i];
    for ( j=0; j<vs->length; j++ ) {
      proc = rtable[vs->col[j]];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) { 
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 
    }
  }
  
  nrqs     = 0;              /* number of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa);
  for (i=0, j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  } 

  /* Each message would have a header = 1 + 2*(no of ROWS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2*w3[j];   
    msz   += w1[j];  
  }
  
  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1   = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2   = rw1+size;
    ierr   = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
    bsz   = rw1[rank];
    ierr  = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr);
    nrqr  = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */
  len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf1    = (int**) PetscMalloc(len);  CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) (rbuf1 + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ){
    ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);CHKERRQ(ierr);
  }
  
  /* Allocate Memory for outgoing messages */
  len   = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size;     /* Pointers to the data in outgoing buffers */
  PetscMemzero(sbuf1,2*size*sizeof(int*));
  tmp   = (int *) (sbuf1 + 2*size);
  ctr   = tmp + msz;

  {
    int *iptr = tmp,ict  = 0;
    for ( i=0; i<nrqs; i++ ) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Clean up the header space */
  for ( i=0; i<nrqs; i++ ) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
    ptr[j]       = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the matrix and copy the data into sbuf1 */
  for ( i=0; i<a->m; i++ ) {
    PetscMemzero(ctr,size*sizeof(int));
    vs = A->rows[i];
    for ( j=0; j<vs->length; j++ ) {
      col  = vs->col[j];
      proc = rtable[col];
      if (proc != rank) { /* copy to the outgoing buffer */
        ctr[proc]++;
          *ptr[proc] = col;
          ptr[proc]++;
      } else {
        row = col - rstart;
        col = i + rstart;
        ierr = MatSetValues_MPIRowbs_local(mat,1,&row,1,&col,&val,ADD_VALUES);CHKERRQ(ierr);
      }
    }
    /* Update the headers for the current row */
    for ( j=0; j<size; j++ ) { /* Can Optimise this loop by using pa[] */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k               = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i + rstart;
      }
    }
  }
   /* Check Validity of the outgoing messages */
  {
    int sum;
    for ( i=0 ; i<nrqs ; i++) {
      j = pa[i];
      if (w3[j] != sbuf1[j][0]) {SETERRQ(PETSC_ERR_PLIB,0,"Blew it! Header[1] mismatch!\n"); }
    }

    for ( i=0 ; i<nrqs ; i++) {
      j = pa[i];
      sum = 1;
      for (k = 1; k <= w3[j]; k++) sum += sbuf1[j][2*k]+2;
      if (sum != w1[j]) { SETERRQ(PETSC_ERR_PLIB,0,"Blew it! Header[2-n] mismatch!\n"); }
    }
  }
 
  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);CHKERRQ(ierr);
  }
   
  /* Receive messages*/
  r_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status);
  for ( i=0; i<nrqr; ++i ) {
    ierr = MPI_Waitany(nrqr, r_waits1, &index, r_status+i);CHKERRQ(ierr);
    /* Process the Message */
    {
      int    *rbuf1_i,n_row,ct1;

      rbuf1_i = rbuf1[index];
      n_row   = rbuf1_i[0];
      ct1     = 2*n_row+1;
      val     = 0.0;
      /* Optimise this later */
      for ( j=1; j<=n_row; j++ ) {
        col = rbuf1_i[2*j-1];
        for ( k=0; k<rbuf1_i[2*j]; k++,ct1++ ) {
          row = rbuf1_i[ct1] - rstart;
          ierr = MatSetValues_MPIRowbs_local(mat,1,&row,1,&col,&val,ADD_VALUES);CHKERRQ(ierr);
        }
      }
    }
  }

  s_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status);
  ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);

  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(pa);
  PetscFree(rbuf1);
  PetscFree(sbuf1);
  PetscFree(r_waits1);
  PetscFree(s_waits1);
  PetscFree(r_status);
  PetscFree(s_status);
  PetscFunctionReturn(0);    
}

/*
     This does the BlockSolve portion of the matrix assembly.
   It is provided in a seperate routine so that users can
   operate on the matrix (using MatScale(), MatShift() etc.) after 
   the matrix has been assembled but before BlockSolve has sucked it
   in and devoured it.
*/
#undef __FUNC__  
#define __FUNC__ "MatAssemblyEnd_MPIRowbs_ForBlockSolve"
int MatAssemblyEnd_MPIRowbs_ForBlockSolve(Mat mat)
{ 
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  int          ierr,ldim,low,high,i;
  Scalar       *diag;

  PetscFunctionBegin;
  if ((mat->was_assembled) && (!mat->same_nonzero)) {  /* Free the old info */
    if (a->pA)       {BSfree_par_mat(a->pA);   CHKERRBS(0);}
    if (a->comm_pA)  {BSfree_comm(a->comm_pA); CHKERRBS(0);} 
  }

  if ((!mat->same_nonzero) || (!mat->was_assembled)) {
    /* Form permuted matrix for efficient parallel execution */
    a->pA = BSmain_perm(a->procinfo,a->A); CHKERRBS(0);
    /* Set up the communication */
    a->comm_pA = BSsetup_forward(a->pA,a->procinfo); CHKERRBS(0);
  } else {
    /* Repermute the matrix */
    BSmain_reperm(a->procinfo,a->A,a->pA); CHKERRBS(0);
  }

  /* Symmetrically scale the matrix by the diagonal */
  BSscale_diag(a->pA,a->pA->diag,a->procinfo); CHKERRBS(0);

  /* Store inverse of square root of permuted diagonal scaling matrix */
  ierr = VecGetLocalSize( a->diag, &ldim ); CHKERRQ(ierr);
  ierr = VecGetOwnershipRange( a->diag, &low, &high ); CHKERRQ(ierr);
  ierr = VecGetArray(a->diag,&diag); CHKERRQ(ierr);
  for (i=0; i<ldim; i++) {
    if (a->pA->scale_diag[i] != 0.0) {
      diag[i] = 1.0/sqrt(PetscAbsScalar(a->pA->scale_diag[i]));
    } else {
      diag[i] = 1.0;
    }   
  }
  a->blocksolveassembly = 1;
  mat->was_assembled    = PETSC_TRUE;
  mat->same_nonzero     = PETSC_TRUE;
  PLogInfo(mat,"MatAssemblyEnd_MPIRowbs_ForBlockSolve:Completed BlockSolve95 matrix assembly\n");
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatAssemblyEnd_MPIRowbs"
int MatAssemblyEnd_MPIRowbs(Mat mat,MatAssemblyType mode)
{ 
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  MPI_Status   *send_status,recv_status;
  int          imdex,nrecvs = a->nrecvs, count = nrecvs, i, n;
  int          row, col, ierr, rstart, nzcount;
  Scalar       *values, val;
  InsertMode   addv = mat->insertmode;

  PetscFunctionBegin;
  rstart = a->rstart;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,a->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    values = a->rvalues + 3*imdex*a->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      row = (int) PetscReal(values[3*i]) - rstart;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= 0 && col < a->N) {
        ierr = MatSetValues_MPIRowbs_local(mat,1,&row,1,&col,&val,addv);CHKERRQ(ierr);
      } else SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid column");
    }
    count--;
  }
  PetscFree(a->recv_waits); PetscFree(a->rvalues);
 
  /* wait on sends */
  if (a->nsends) {
    send_status = (MPI_Status *) PetscMalloc( a->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(a->nsends,a->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(a->send_waits); PetscFree(a->svalues);
  nzcount = a->nz; /* This is the number of nonzeros entered by the user */
  /* BlockSolve requires that the matrix is structurally symmetric */
  if (mode == MAT_FINAL_ASSEMBLY && !a->mat_is_structurally_symmetric) {
    ierr = MatAssemblyEnd_MPIRowbs_MakeSymmetric(mat); CHKERRQ(ierr);
  }
  
  /* BlockSolve requires that all the diagonal elements are set */
  val  = 0.0;
  for ( i=0; i<a->m; i++ ) {
    row = i; col = i + rstart;
    ierr = MatSetValues_MPIRowbs_local(mat,1,&row,1,&col,&val,ADD_VALUES);CHKERRQ(ierr);
  }
  
  ierr = MatAssemblyBegin_MPIRowbs_local(mat,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd_MPIRowbs_local(mat,mode); CHKERRQ(ierr);
  
  a->blocksolveassembly = 0;
  PLogInfo(mat,"MatAssemblyEnd_MPIRowbs:Matrix size: %d X %d; storage space: %d unneeded, %d used\n",
           a->m,a->n,a->maxnz-a->nz,a->nz);
  PLogInfo(mat,"MatAssemblyEnd_MPIRowbs: User entered %d nonzeros, PETSc added %d\n",
           nzcount,a->nz-nzcount);    
  PLogInfo(mat,"MatAssemblyEnd_MPIRowbs:Number of mallocs during MatSetValues is %d\n",
           a->reallocs);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatZeroEntries_MPIRowbs"
int MatZeroEntries_MPIRowbs(Mat mat)
{
  Mat_MPIRowbs *l = (Mat_MPIRowbs *) mat->data;
  BSspmat      *A = l->A;
  BSsprow      *vs;
  int          i, j;

  PetscFunctionBegin;
  for (i=0; i < l->m; i++) {
    vs = A->rows[i];
    for (j=0; j< vs->length; j++) vs->nz[j] = 0.0;
  }
  PetscFunctionReturn(0);
}

/* the code does not do the diagonal entries correctly unless the 
   matrix is square and the column and row owerships are identical.
   This is a BUG.
*/

#undef __FUNC__  
#define __FUNC__ "MatZeroRows_MPIRowbs"
int MatZeroRows_MPIRowbs(Mat A,IS is,Scalar *diag)
{
  Mat_MPIRowbs   *l = (Mat_MPIRowbs *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) {nsends += procs[i];} 

  /* inform other processors of number of messages and max length*/
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank]; 
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to 
         the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *)PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0; 
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  base = owners[rank];

  /*  wait on receives */
  lens = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]    = n;
    slen           += n;
    count--;
  }
  PetscFree(recv_waits); 
  
  /* move the data into the send scatter */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);
    
  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp); CHKERRQ(ierr);  
  PLogObjectParent(A,istmp);
  PetscFree(lrows);
  ierr = MatZeroRows_MPIRowbs_local(A,istmp,diag); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatNorm_MPIRowbs"
int MatNorm_MPIRowbs(Mat mat,NormType type,double *norm)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  BSsprow      *vs, **rs;
  Scalar       *xv;
  double       sum = 0.0;
  int          *xi, nz, i, j, ierr;

  PetscFunctionBegin;
  if (a->size == 1) {
    ierr = MatNorm_MPIRowbs_local(mat,type,norm); CHKERRQ(ierr);
  } else {
    rs = a->A->rows;
    if (type == NORM_FROBENIUS) {
      for (i=0; i<a->m; i++ ) {
        vs = *rs++;
        nz = vs->length;
        xv = vs->nz;
        while (nz--) {
#if defined(USE_PETSC_COMPLEX)
          sum += real(conj(*xv)*(*xv)); xv++;
#else
          sum += (*xv)*(*xv); xv++;
#endif
        }
      }
      ierr  = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
      *norm = sqrt(*norm);
    } else if (type == NORM_1) { /* max column norm */
      double *tmp, *tmp2;
      tmp = (double *) PetscMalloc( a->n*sizeof(double) ); CHKPTRQ(tmp);
      tmp2 = (double *) PetscMalloc( a->n*sizeof(double) ); CHKPTRQ(tmp2);
      PetscMemzero(tmp,a->n*sizeof(double));
      *norm = 0.0;
      for (i=0; i<a->m; i++) {
        vs = *rs++;
        nz = vs->length;
        xi = vs->col;
        xv = vs->nz;
        while (nz--) {
          tmp[*xi] += PetscAbsScalar(*xv); 
          xi++; xv++;
        }
      }
      ierr = MPI_Allreduce(tmp,tmp2,a->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
      for ( j=0; j<a->n; j++ ) {
        if (tmp2[j] > *norm) *norm = tmp2[j];
      }
      PetscFree(tmp); PetscFree(tmp2);
    } else if (type == NORM_INFINITY) { /* max row norm */
      double ntemp = 0.0;
      for ( i=0; i<a->m; i++ ) {
        vs = *rs++;
        nz = vs->length;
        xv = vs->nz;
        sum = 0.0;
        while (nz--) {
          sum += PetscAbsScalar(*xv); xv++;
        }
        if (sum > ntemp) ntemp = sum;
      }
      ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_SUP,0,"No support for two norm");
    }
  }
  PetscFunctionReturn(0); 
}

#undef __FUNC__  
#define __FUNC__ "MatMult_MPIRowbs"
int MatMult_MPIRowbs(Mat mat,Vec xx,Vec yy)
{
  Mat_MPIRowbs *bsif = (Mat_MPIRowbs *) mat->data;
  BSprocinfo   *bspinfo = bsif->procinfo;
  Scalar       *xxa, *xworka, *yya;
  int          ierr;

  PetscFunctionBegin;
  if (!bsif->blocksolveassembly) {
    ierr = MatAssemblyEnd_MPIRowbs_ForBlockSolve(mat); CHKERRQ(ierr);
  }

  ierr = VecGetArray(yy,&yya); CHKERRQ(ierr);
  ierr = VecGetArray(xx,&xxa); CHKERRQ(ierr);

  /* Permute and apply diagonal scaling:  [ xwork = D^{1/2} * x ] */
  if (!bsif->vecs_permscale) {
    ierr = VecGetArray(bsif->xwork,&xworka); CHKERRQ(ierr);
    BSperm_dvec(xxa,xworka,bsif->pA->perm); CHKERRBS(0);
    ierr = VecPointwiseDivide(bsif->xwork,bsif->diag,xx); CHKERRQ(ierr);
  } 

  /* Do lower triangular multiplication:  [ y = L * xwork ] */
  if (bspinfo->single) {
    BSforward1( bsif->pA, xxa, yya, bsif->comm_pA, bspinfo );CHKERRBS(0);
  }  else {
    BSforward( bsif->pA, xxa, yya, bsif->comm_pA, bspinfo );CHKERRBS(0);
  }
  

  /* Do upper triangular multiplication:  [ y = y + L^{T} * xwork ] */
  if (bsif->mat_is_symmetric) {
    if (bspinfo->single){
      BSbackward1( bsif->pA, xxa, yya, bsif->comm_pA, bspinfo );CHKERRBS(0);
    } else {
      BSbackward( bsif->pA, xxa, yya, bsif->comm_pA, bspinfo );CHKERRBS(0);
    }
    
  }
  /* not needed for ILU version since forward does it all */

  /* Apply diagonal scaling to vector:  [  y = D^{1/2} * y ] */
  if (!bsif->vecs_permscale) {
    BSiperm_dvec(xworka,xxa,bsif->pA->perm); CHKERRBS(0);
    ierr = VecPointwiseDivide(yy,bsif->diag,bsif->xwork); CHKERRQ(ierr);
    BSiperm_dvec(xworka,yya,bsif->pA->perm); CHKERRBS(0);
  }
  PLogFlops(2*bsif->nz - bsif->m);

  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatMultAdd_MPIRowbs"
int MatMultAdd_MPIRowbs(Mat mat,Vec xx,Vec yy, Vec zz)
{
  int     ierr;
  Scalar  one = 1.0;

  PetscFunctionBegin;
  ierr = (*mat->ops->mult)(mat,xx,zz); CHKERRQ(ierr);
  ierr = VecAXPY(&one,yy,zz); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetInfo_MPIRowbs"
int MatGetInfo_MPIRowbs(Mat A,MatInfoType flag,MatInfo *info)
{
  Mat_MPIRowbs *mat = (Mat_MPIRowbs *) A->data;
  double       isend[5], irecv[5];
  int          ierr;

  PetscFunctionBegin;
  info->rows_global    = (double)mat->M;
  info->columns_global = (double)mat->N;
  info->rows_local     = (double)mat->m;
  info->columns_local  = (double)mat->N;
  info->block_size     = 1.0;
  info->mallocs        = (double)mat->reallocs;
  isend[0] = mat->nz; isend[1] = mat->maxnz; isend[2] =  mat->maxnz -  mat->nz;
  isend[3] = A->mem;  isend[4] = info->mallocs;

  if (flag == MAT_LOCAL) {
    info->nz_used      = isend[0];
    info->nz_allocated = isend[1];
    info->nz_unneeded  = isend[2];
    info->memory       = isend[3];
    info->mallocs      = isend[4];
  } else if (flag == MAT_GLOBAL_MAX) {
    ierr = MPI_Allreduce(isend,irecv,3,MPI_INT,MPI_MAX,A->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else if (flag == MAT_GLOBAL_SUM) {
    ierr = MPI_Allreduce(isend,irecv,3,MPI_INT,MPI_SUM,A->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetDiagonal_MPIRowbs"
int MatGetDiagonal_MPIRowbs(Mat mat,Vec v)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  BSsprow      **rs = a->A->rows;
  int          i, n, ierr;
  Scalar       *x, zero = 0.0;

  PetscFunctionBegin;
  if (mat->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");  
  if (!a->blocksolveassembly) {
    ierr = MatAssemblyEnd_MPIRowbs_ForBlockSolve(mat); CHKERRQ(ierr);
  }

  VecSet(&zero,v);
  VecGetArray(v,&x); VecGetLocalSize(v,&n);
  if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming mat and vec");
  for ( i=0; i<a->m; i++ ) {
    x[i] = rs[i]->nz[rs[i]->diag_ind]; 
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatDestroy_MPIRowbs"
int MatDestroy_MPIRowbs(Mat mat)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;
  BSspmat      *A = a->A;
  BSsprow      *vs;
  int          i, ierr;

  PetscFunctionBegin;
#if defined(USE_PETSC_LOG)
  PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",a->M,a->N);
#endif
  PetscFree(a->rowners); 

  ierr = StashDestroy_Private(&a->stash); CHKERRQ(ierr);
  if (a->bsmap) {
    if (a->bsmap->vlocal2global) PetscFree(a->bsmap->vlocal2global);
    if (a->bsmap->vglobal2local) PetscFree(a->bsmap->vglobal2local);
    if (a->bsmap->vglobal2proc)  (*a->bsmap->free_g2p)(a->bsmap->vglobal2proc);
    PetscFree(a->bsmap);
  } 

  if (A) {
    for (i=0; i<a->m; i++) {
      vs = A->rows[i];
      ierr = MatFreeRowbs_Private(mat,vs->length,vs->col,vs->nz); CHKERRQ(ierr);
    }
    /* Note: A->map = a->bsmap is freed above */
    PetscFree(A->rows);
    PetscFree(A);
  }
  if (a->procinfo) {BSfree_ctx(a->procinfo); CHKERRBS(0);}
  if (a->diag)     {ierr = VecDestroy(a->diag); CHKERRQ(ierr);}
  if (a->xwork)    {ierr = VecDestroy(a->xwork); CHKERRQ(ierr);}
  if (a->pA)       {BSfree_par_mat(a->pA); CHKERRBS(0);}
  if (a->fpA)      {BSfree_copy_par_mat(a->fpA); CHKERRBS(0);}
  if (a->comm_pA)  {BSfree_comm(a->comm_pA); CHKERRBS(0);}
  if (a->comm_fpA) {BSfree_comm(a->comm_fpA); CHKERRBS(0);}
  if (a->imax)     PetscFree(a->imax);
  PetscFree(a);
  if (mat->mapping) {
    ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
  }
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);

  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatSetOption_MPIRowbs"
int MatSetOption_MPIRowbs(Mat A,MatOption op)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) A->data;

  if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
  else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0; 
  else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
  else if (op == MAT_COLUMNS_UNSORTED)          a->sorted      = 0;
  else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
  else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
  else if (op == MAT_SYMMETRIC) {
    if (a->blocksolveassembly && !a->mat_is_symmetric) {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Cannot set symmetric after use");
    }
    BSset_mat_symmetric(a->A,PETSC_TRUE);
    BSset_mat_icc_storage(a->A,PETSC_TRUE);
    a->mat_is_symmetric = 1;
    a->mat_is_structurally_symmetric = 1;
  } else if (op == MAT_STRUCTURALLY_SYMMETRIC) {
    a->mat_is_structurally_symmetric = 1;
  } else if (op == MAT_YES_NEW_DIAGONALS ||
             op == MAT_ROWS_SORTED || 
             op == MAT_NEW_NONZERO_LOCATION_ERROR ||
             op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
             op == MAT_ROWS_UNSORTED ||
             op == MAT_USE_HASH_TABLE) {
    PLogInfo(A,"MatSetOption_MPIRowbs:Option ignored\n");
  }  else if (op == MAT_COLUMN_ORIENTED) {
    SETERRQ(PETSC_ERR_SUP,0,"MAT_COLUMN_ORIENTED");
  }  else if (op == MAT_NO_NEW_DIAGONALS) {
    SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
  } else {
    SETERRQ(PETSC_ERR_SUP,0,"unknown option");
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetSize_MPIRowbs"
int MatGetSize_MPIRowbs(Mat mat,int *m,int *n)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;

  PetscFunctionBegin;
  *m = a->M; *n = a->N;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetLocalSize_MPIRowbs"
int MatGetLocalSize_MPIRowbs(Mat mat,int *m,int *n)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;

  PetscFunctionBegin;
  *m = a->m; *n = a->n;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetOwnershipRange_MPIRowbs"
int MatGetOwnershipRange_MPIRowbs(Mat A,int *m,int *n)
{
  Mat_MPIRowbs *mat = (Mat_MPIRowbs *) A->data;

  PetscFunctionBegin;
  *m = mat->rstart; *n = mat->rend;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatGetRow_MPIRowbs"
int MatGetRow_MPIRowbs(Mat AA,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIRowbs *mat = (Mat_MPIRowbs *) AA->data;
  BSspmat      *A = mat->A;
  BSsprow      *rs;
   
  PetscFunctionBegin;
  if (row < mat->rstart || row >= mat->rend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Only local rows");

  rs  = A->rows[row - mat->rstart];
  *nz = rs->length;
  if (v)   *v   = rs->nz;
  if (idx) *idx = rs->col;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatRestoreRow_MPIRowbs"
int MatRestoreRow_MPIRowbs(Mat A,int row,int *nz,int **idx,Scalar **v)
{
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------ */

#undef __FUNC__  
#define __FUNC__ "MatPrintHelp_MPIRowbs"
int MatPrintHelp_MPIRowbs(Mat A)
{
  static int called = 0; 
  MPI_Comm   comm = A->comm;

  PetscFunctionBegin;
  if (called) {PetscFunctionReturn(0);} else called = 1;
  (*PetscHelpPrintf)(comm," Options for MATMPIROWBS matrix format (needed for BlockSolve):\n");
  (*PetscHelpPrintf)(comm,"  -mat_rowbs_no_inode  - Do not use inodes\n");
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------*/
extern int MatCholeskyFactorNumeric_MPIRowbs(Mat,Mat*);
extern int MatIncompleteCholeskyFactorSymbolic_MPIRowbs(Mat,IS,double,int,Mat *);
extern int MatLUFactorNumeric_MPIRowbs(Mat,Mat*);
extern int MatILUFactorSymbolic_MPIRowbs(Mat,IS,IS,double,int,Mat *);
extern int MatSolve_MPIRowbs(Mat,Vec,Vec);
extern int MatForwardSolve_MPIRowbs(Mat,Vec,Vec);
extern int MatBackwardSolve_MPIRowbs(Mat,Vec,Vec);

static struct _MatOps MatOps = {MatSetValues_MPIRowbs,
       MatGetRow_MPIRowbs,MatRestoreRow_MPIRowbs,
       MatMult_MPIRowbs,MatMultAdd_MPIRowbs, 
       MatMult_MPIRowbs,MatMultAdd_MPIRowbs,
       MatSolve_MPIRowbs,0,0,0,
       0,0,
       0,0,
       MatGetInfo_MPIRowbs,0,
       MatGetDiagonal_MPIRowbs,0,MatNorm_MPIRowbs,
       MatAssemblyBegin_MPIRowbs,MatAssemblyEnd_MPIRowbs,
       0,
       MatSetOption_MPIRowbs,MatZeroEntries_MPIRowbs,MatZeroRows_MPIRowbs,
       0,MatLUFactorNumeric_MPIRowbs,0,MatCholeskyFactorNumeric_MPIRowbs,
       MatGetSize_MPIRowbs,MatGetLocalSize_MPIRowbs,
       MatGetOwnershipRange_MPIRowbs,
       MatILUFactorSymbolic_MPIRowbs,
       MatIncompleteCholeskyFactorSymbolic_MPIRowbs,
       0,0,
       0,MatForwardSolve_MPIRowbs,MatBackwardSolve_MPIRowbs,
       0,0,0,
       0,0,0,0,MatPrintHelp_MPIRowbs,MatScale_MPIRowbs};

/* ------------------------------------------------------------------- */

#undef __FUNC__  
#define __FUNC__ "MatCreateMPIRowbs"
/*@C
   MatCreateMPIRowbs - Creates a sparse parallel matrix in the MATMPIROWBS
   format.  This format is intended primarily as an interface for BlockSolve95.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated)
.  M - number of global rows (or PETSC_DECIDE to have calculated)
.  nz - number of nonzeros per row (same for all local rows)
.  nzz - number of nonzeros per row (possibly different for each row).
-  procinfo - optional BlockSolve95 BSprocinfo context (use PETSC_NULL
   for PETSc to create and initialize this context)

   Output Parameter:
.  newA - the matrix 

   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   Specify the preallocated storage with either nz or nnz (not both).  Set 
   nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 
   allocation.

   Notes:
   By default, the matrix is assumed to be nonsymmetric; the user can
   take advantage of special optimizations for symmetric matrices by calling
$     MatSetOption(mat,MAT_SYMMETRIC)
   BEFORE calling the routine MatAssemblyBegin().

   Internally, the MATMPIROWBS format inserts zero elements to the
   matrix if necessary, so that nonsymmetric matrices are considered
   to be symmetric in terms of their sparsity structure; this format
   is required for use of the parallel communication routines within
   BlockSolve95. In particular, if the matrix element A[i,j] exists,
   then PETSc will internally allocate a 0 value for the element
   A[j,i] during MatAssemblyEnd() if the user has not already set
   a value for the matrix element A[j,i].

   Fortran Note:
   Fortran users should always use PETSC_NULL for procinfo.

   Options Database Keys:
.  -mat_rowbs_no_inode - Do not use inodes.
  
.keywords: matrix, row, symmetric, sparse, parallel, BlockSolve

.seealso: MatCreate(), MatSetValues()
@*/
int MatCreateMPIRowbs(MPI_Comm comm,int m,int M,int nz,int *nnz,void *procinfo,Mat *newA)
{
  Mat          A;
  Mat_MPIRowbs *a;
  BSmapping    *bsmap;
  BSoff_map    *bsoff;
  int          i, ierr, Mtemp, *offset, low, high,flg1,flg2,flg3;
  BSprocinfo   *bspinfo = (BSprocinfo *) procinfo;
  
  PetscFunctionBegin;
  *newA = 0;

  if (m == PETSC_DECIDE && nnz) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set nnz");
  }

  PetscHeaderCreate(A,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIROWBS,comm,MatDestroy,MatView);
  PLogObjectCreate(A);
  PLogObjectMemory(A,sizeof(struct _p_Mat));

  A->data = (void *) (a = PetscNew(Mat_MPIRowbs)); CHKPTRQ(a);
  PetscMemcpy(A->ops,&MatOps,sizeof(struct _MatOps));
  A->ops->destroy	= MatDestroy_MPIRowbs;
  A->ops->view	        = MatView_MPIRowbs;
  A->factor	        = 0;
  A->mapping            = 0;
  a->vecs_permscale     = 0;
  A->insertmode         = NOT_SET_VALUES;
  a->blocksolveassembly = 0;
  MPI_Comm_rank(comm,&a->rank);
  MPI_Comm_size(comm,&a->size);

  if (M != PETSC_DECIDE && m != PETSC_DECIDE) {
    /* Perhaps should be removed for better efficiency -- but could be risky. */
    ierr = MPI_Allreduce(&m,&Mtemp,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
    if (Mtemp != M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Sum of local dimensions!=global dimension");
  } else if (M == PETSC_DECIDE) {
    ierr = MPI_Allreduce(&m,&M,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  } else if (m == PETSC_DECIDE) {
    m = M/a->size + ((M % a->size) > a->rank);
  } else {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Must set local and/or global matrix size");
  }
  a->N = M;    A->N = M;
  a->M = M;    A->M = M;
  a->m = m;    A->m = m;
  a->n = a->N; A->n = A->N;  /* each row stores all columns */
  a->imax = (int *) PetscMalloc( (a->m+1)*sizeof(int) );CHKPTRQ(a->imax);
  a->mat_is_symmetric              = 0;
  a->mat_is_structurally_symmetric = 0;
  a->reallocs                      = 0;

  /* build local table of row ownerships */
  a->rowners    = (int *) PetscMalloc((a->size+2)*sizeof(int));CHKPTRQ(a->rowners);
  ierr          = MPI_Allgather(&m,1,MPI_INT,a->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  a->rowners[0] = 0;
  for ( i=2; i<=a->size; i++ ) {
    a->rowners[i] += a->rowners[i-1];
  }
  a->rstart = a->rowners[a->rank]; 
  a->rend   = a->rowners[a->rank+1]; 
  PLogObjectMemory(A,(a->m+a->size+3)*sizeof(int));

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&a->stash); CHKERRQ(ierr);

  /* Initialize BlockSolve information */
  a->A	      = 0;
  a->pA	      = 0;
  a->comm_pA  = 0;
  a->fpA      = 0;
  a->comm_fpA = 0;
  a->alpha    = 1.0;
  a->ierr     = 0;
  a->failures = 0;
  ierr = VecCreateMPI(A->comm,a->m,a->M,&(a->diag)); CHKERRQ(ierr);
  ierr = VecDuplicate(a->diag,&(a->xwork));CHKERRQ(ierr);
  PLogObjectParent(A,a->diag);  PLogObjectParent(A,a->xwork);
  PLogObjectMemory(A,(a->m+1)*sizeof(Scalar));
  if (!bspinfo) {bspinfo = BScreate_ctx(); CHKERRBS(0);}
  a->procinfo = bspinfo;
  BSctx_set_id(bspinfo,a->rank); CHKERRBS(0);
  BSctx_set_np(bspinfo,a->size); CHKERRBS(0);
  /*
    With some older releases of BlockSolve95 you need to have the 
   cast of (ProcSet *) below.
  */
  BSctx_set_ps(bspinfo,/* (ProcSet*) */comm); CHKERRBS(0);
  BSctx_set_cs(bspinfo,INT_MAX); CHKERRBS(0);
  BSctx_set_is(bspinfo,INT_MAX); CHKERRBS(0);
  BSctx_set_ct(bspinfo,IDO); CHKERRBS(0);
#if defined(USE_PETSC_DEBUG)
  BSctx_set_err(bspinfo,1); CHKERRBS(0);  /* BS error checking */
#endif
  BSctx_set_rt(bspinfo,1); CHKERRBS(0);
  ierr = OptionsHasName(PETSC_NULL,"-log_info",&flg1); CHKERRQ(ierr);
  if (flg1) {
    BSctx_set_pr(bspinfo,1); CHKERRBS(0);
  }
  ierr = OptionsHasName(PETSC_NULL,"-pc_ilu_factorpointwise",&flg1);CHKERRQ(ierr);
  ierr = OptionsHasName(PETSC_NULL,"-pc_icc_factorpointwise",&flg2);CHKERRQ(ierr);
  ierr = OptionsHasName(PETSC_NULL,"-mat_rowbs_no_inode",&flg3);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {
    BSctx_set_si(bspinfo,1); CHKERRBS(0);
  } else {
    BSctx_set_si(bspinfo,0); CHKERRBS(0);
  }
#if defined(USE_PETSC_LOG)
  MLOG_INIT();  /* Initialize logging */
#endif

  /* Compute global offsets */
  ierr = MatGetOwnershipRange(A,&low,&high); CHKERRQ(ierr);
  offset = &low;

  a->bsmap = (BSmapping  *) PetscNew(BSmapping); CHKPTRQ(a->bsmap);
  PLogObjectMemory(A,sizeof(BSmapping));
  bsmap = a->bsmap;
  bsmap->vlocal2global	= (int *) PetscMalloc(sizeof(int));CHKPTRQ(bsmap->vlocal2global);
  *((int *)bsmap->vlocal2global) = (*offset);
  bsmap->flocal2global	= BSloc2glob;
  bsmap->free_l2g	= 0;
  bsmap->vglobal2local	= (int *) PetscMalloc(sizeof(int));CHKPTRQ(bsmap->vglobal2local);
  *((int *)bsmap->vglobal2local) = (*offset);
  bsmap->fglobal2local	= BSglob2loc;
  bsmap->free_g2l	= 0;
  bsoff = BSmake_off_map( *offset, bspinfo, a->M );
  bsmap->vglobal2proc	= (void *)bsoff;
  bsmap->fglobal2proc	= BSglob2proc;
  bsmap->free_g2p	= (void(*)(void*)) BSfree_off_map;

  ierr = MatCreateMPIRowbs_local(A,nz,nnz); CHKERRQ(ierr);
  ierr = OptionsHasName(PETSC_NULL,"-help",&flg1); CHKERRQ(ierr);
  if (flg1) {
    ierr = MatPrintHelp(A); CHKERRQ(ierr);
  }
  *newA = A;
  PetscFunctionReturn(0);
}
/* --------------- extra BlockSolve-specific routines -------------- */
#undef __FUNC__  
#define __FUNC__ "MatGetBSProcinfo"
/* @
  MatGetBSProcinfo - Gets the BlockSolve BSprocinfo context, which the
  user can then manipulate to alter the default parameters.

  Input Parameter:
  mat - matrix

  Output Parameter:
  procinfo - processor information context

  Note:
  This routine is valid only for matrices stored in the MATMPIROWBS
  format.
@ */
int MatGetBSProcinfo(Mat mat,BSprocinfo *procinfo)
{
  Mat_MPIRowbs *a = (Mat_MPIRowbs *) mat->data;

  PetscFunctionBegin;
  if (mat->type != MATMPIROWBS) SETERRQ(PETSC_ERR_ARG_WRONG,0,"For MATMPIROWBS matrix type");
  procinfo = a->procinfo;
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatLoad_MPIRowbs"
int MatLoad_MPIRowbs(Viewer viewer,MatType type,Mat *newmat)
{
  Mat_MPIRowbs *a;
  BSspmat      *A;
  BSsprow      **rs;
  Mat          mat;
  int          i, nz, ierr, j,rstart, rend, fd, *ourlens,*sndcounts = 0,*procsnz;
  int          header[4],rank,size,*rowlengths = 0,M,m,*rowners,maxnz,*cols;
  Scalar       *vals;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;

  PetscFunctionBegin;
  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"Not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIRowbs");
    }
  }


  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; 
  /* determine ownership of all rows */
  m          = M/size + ((M % size) > rank);
  rowners    = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
  ierr       = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) {
    rowners[i] += rowners[i-1];
  }
  rstart = rowners[rank]; 
  rend   = rowners[rank+1]; 

  /* distribute row lengths to all processors */
  ourlens = (int*) PetscMalloc( (rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  /* create our matrix */
  ierr = MatCreateMPIRowbs(comm,m,M,0,ourlens,PETSC_NULL,newmat); CHKERRQ(ierr);
  mat = *newmat;
  PetscFree(ourlens);

  a = (Mat_MPIRowbs *) mat->data;
  A = a->A;
  rs = A->rows;

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices  */
    nz = procsnz[0];
    ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
    
    /* insert it into my part of matrix */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<a->imax[i]; j++) {
        rs[i]->col[j] = cols[nz++];
      }
      rs[i]->length = a->imax[i];
    }
    /* read in parts for all other processors */
    for ( i=1; i<size; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,mat->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
    vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);

    /* read in my part of the matrix numerical values  */
    nz   = procsnz[0];
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    
    /* insert it into my part of matrix */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<a->imax[i]; j++) {
        rs[i]->nz[j] = vals[nz++];
      }
    }
    /* read in parts for all other processors */
    for ( i=1; i<size; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,mat->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(vals); PetscFree(procsnz);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      nz += a->imax[i];
    }
    cols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(cols);

    /* receive message of column indices*/
    ierr = MPI_Recv(cols,nz,MPI_INT,0,mat->tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong");

    /* insert it into my part of matrix */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<a->imax[i]; j++) {
        rs[i]->col[j] = cols[nz++];
      }
      rs[i]->length = a->imax[i];
    }
    PetscFree(cols);
    vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);

    /* receive message of values*/
    ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,mat->tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong");

    /* insert it into my part of matrix */
    nz = 0;
    for ( i=0; i<A->num_rows; i++ ) {
      for (j=0; j<a->imax[i]; j++) {
        rs[i]->nz[j] = vals[nz++];
      }
      rs[i]->length = a->imax[i];
    }
    PetscFree(vals);
 
  }
  PetscFree(rowners);
  a->nz = a->maxnz;
  ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* 
    Special destroy and view routines for factored matrices 
*/
#undef __FUNC__  
#define __FUNC__ "MatDestroy_MPIRowbs_Factored"
static int MatDestroy_MPIRowbs_Factored(Mat mat)
{
  PetscFunctionBegin;
#if defined(USE_PETSC_LOG)
  PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",mat->M,mat->N);
#endif
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatView_MPIRowbs_Factored"
static int MatView_MPIRowbs_Factored(Mat mat,Viewer viewer)
{
  int ierr;

  PetscFunctionBegin;
  ierr = MatView((Mat) mat->data,viewer); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "MatIncompleteCholeskyFactorSymbolic_MPIRowbs"
int MatIncompleteCholeskyFactorSymbolic_MPIRowbs(Mat mat,IS isrow,
                                      double f,int fill,Mat *newfact)
{
  /* Note:  f is not currently used in BlockSolve */
  Mat          newmat;
  Mat_MPIRowbs *mbs = (Mat_MPIRowbs *) mat->data;
  int          ierr;
  PetscTruth   idn;

  PetscFunctionBegin;
  if (isrow) {
    ierr = ISIdentity(isrow,&idn); CHKERRQ(ierr);
    if (!idn) SETERRQ(PETSC_ERR_SUP,0,"Only identity row permutation supported");
  }
  if (!mbs->blocksolveassembly) {
    ierr = MatSetOption(mat,MAT_SYMMETRIC);CHKERRQ(ierr);
    ierr = MatAssemblyEnd_MPIRowbs_ForBlockSolve(mat); CHKERRQ(ierr);
  }

  if (!mbs->mat_is_symmetric) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"To use incomplete Cholesky \n\
        preconditioning with a MATMPIROWBS matrix you must declare it to be \n\
        symmetric using the option MatSetOption(A,MAT_SYMMETRIC)");
  }

  /* Copy permuted matrix */
  if (mbs->fpA) {BSfree_copy_par_mat(mbs->fpA); CHKERRBS(0);}
  mbs->fpA = BScopy_par_mat(mbs->pA); CHKERRBS(0);

  /* Set up the communication for factorization */
  if (mbs->comm_fpA) {BSfree_comm(mbs->comm_fpA); CHKERRBS(0);}
  mbs->comm_fpA = BSsetup_factor(mbs->fpA,mbs->procinfo); CHKERRBS(0);

  /* 
      Create a new Mat structure to hold the "factored" matrix, 
    not this merely contains a pointer to the original matrix, since
    the original matrix contains the factor information.
  */
  PetscHeaderCreate(newmat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIROWBS,mat->comm,MatDestroy,MatView);
  PLogObjectCreate(newmat);
  PLogObjectMemory(newmat,sizeof(struct _p_Mat));

  newmat->data          = (void *) mat;
  PetscMemcpy(newmat->ops,&MatOps,sizeof(struct _MatOps));
  newmat->ops->destroy	= MatDestroy_MPIRowbs_Factored;
  newmat->ops->view     = MatView_MPIRowbs_Factored;
  newmat->factor	= 1;
  newmat->M             = mat->M;
  newmat->N             = mat->N;
  newmat->m             = mat->m;
  newmat->n             = mat->n;

  *newfact = newmat; 
  PetscFunctionReturn(0); 
}

#undef __FUNC__  
#define __FUNC__ "MatILUFactorSymbolic_MPIRowbs"
int MatILUFactorSymbolic_MPIRowbs(Mat mat,IS isrow,IS iscol,
                                      double f,int fill,Mat *newfact)
{
  /* Note:  f is not currently used in BlockSolve */
  Mat          newmat;
  Mat_MPIRowbs *mbs = (Mat_MPIRowbs *) mat->data;
  int          ierr;
  PetscTruth   idn;
  PetscFunctionBegin;

  if (isrow) {
    ierr = ISIdentity(isrow,&idn); CHKERRQ(ierr);
    if (!idn) SETERRQ(PETSC_ERR_SUP,0,"Only identity row permutation supported");
  }
  if (iscol) {
    ierr = ISIdentity(iscol,&idn); CHKERRQ(ierr);
    if (!idn) SETERRQ(PETSC_ERR_SUP,0,"Only identity column permutation supported");
  }

  if (!mbs->blocksolveassembly) {
    ierr = MatAssemblyEnd_MPIRowbs_ForBlockSolve(mat); CHKERRQ(ierr);
  }
 
  if (mbs->mat_is_symmetric) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"To use ILU preconditioner with \n\
        MatCreateMPIRowbs() matrix you CANNOT declare it to be a symmetric matrix\n\
        using the option MatSetOption(A,MAT_SYMMETRIC)");
  }

  /* Copy permuted matrix */
  if (mbs->fpA) {BSfree_copy_par_mat(mbs->fpA); CHKERRBS(0);}
  mbs->fpA = BScopy_par_mat(mbs->pA); CHKERRBS(0); 

  /* Set up the communication for factorization */
  if (mbs->comm_fpA) {BSfree_comm(mbs->comm_fpA); CHKERRBS(0);}
  mbs->comm_fpA = BSsetup_factor(mbs->fpA,mbs->procinfo); CHKERRBS(0);

  /* 
      Create a new Mat structure to hold the "factored" matrix, 
    not this merely contains a pointer to the original matrix, since
    the original matrix contains the factor information.
  */
  PetscHeaderCreate(newmat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIROWBS,mat->comm,MatDestroy,MatView);
  PLogObjectCreate(newmat);
  PLogObjectMemory(newmat,sizeof(struct _p_Mat));

  newmat->data          = (void *) mat;
  PetscMemcpy(newmat->ops,&MatOps,sizeof(struct _MatOps));
  newmat->ops->destroy	= MatDestroy_MPIRowbs_Factored;
  newmat->ops->view     = MatView_MPIRowbs_Factored;
  newmat->factor	= 1;
  newmat->M             = mat->M;
  newmat->N             = mat->N;
  newmat->m             = mat->m;
  newmat->n             = mat->n;

  *newfact = newmat; 
  PetscFunctionReturn(0); 
}

#else
#include "petsc.h"
#include "mat.h"
#undef __FUNC__  
#define __FUNC__ "MatCreateMPIRowbs"
int MatCreateMPIRowbs(MPI_Comm comm,int m,int M,int nz,int *nnz,void *info,Mat *newmat)
{
  PetscFunctionBegin;
  SETERRQ(PETSC_ERR_LIB,0,"This matrix format requires BlockSolve95.\n\
  See ${PETSC_DIR}/docs/nstallation.html for information on using PETSc with BlockSolve95.");
}
#endif





