#ifdef PETSC_RCS_HEADER
static char vcid[] = "$Id: baijpc.c,v 1.27 1998/04/03 23:15:42 bsmith Exp $";
#endif

/*
   Defines a block Jacobi preconditioner for the SeqBAIJ format.
*/
#include <math.h>
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "src/pc/pcimpl.h"
#include "src/vec/vecimpl.h"
#include "src/pc/impls/bjacobi/bjacobi.h"
#include "sles.h"

typedef struct {
  int m,m_blocks, nzl, nzu;       
  int *blocks;
  int *rlens;
  int *rlensb;
} PC_GSBAIJ;

typedef struct {
  Vec       *x,*y;             /* work vectors for solves on each block */
  int       *starts;           /* starting point of each block */
  Mat       *mat,*pmat;        /* submatrices for each block */
  IS        *is;               /* for gathering the submatrices */
  PC_GSBAIJ gsdata;            /* additional data for when running with Gauss-Seidel */
} PC_BJacobi_BAIJ;

#undef __FUNC__  
#define __FUNC__ "PCDestroy_BJacobi_BAIJ"
int PCDestroy_BJacobi_BAIJ(PC pc)
{
  PC_BJacobi      *jac = (PC_BJacobi *) pc->data;
  PC_BJacobi_BAIJ *bjac = (PC_BJacobi_BAIJ *) jac->data;
  int             i,ierr;

  PetscFunctionBegin;
  ierr = MatDestroyMatrices(jac->n_local,&bjac->pmat); CHKERRQ(ierr);
  if (jac->use_true_local) {
    ierr = MatDestroyMatrices(jac->n_local,&bjac->mat); CHKERRQ(ierr);
  }

  for (i=0; i<jac->n_local; i++) {
    ierr = SLESDestroy(jac->sles[i]); CHKERRQ(ierr);
    ierr = VecDestroy(bjac->x[i]); CHKERRQ(ierr);
    ierr = VecDestroy(bjac->y[i]); CHKERRQ(ierr);
    ierr = ISDestroy(bjac->is[i]); CHKERRQ(ierr);
  }
  PetscFree(jac->sles);
  PetscFree(bjac->x);
  PetscFree(bjac->starts);
  PetscFree(bjac->is);
  if (jac->gs) {
    PetscFree(bjac->gsdata.blocks);
    PetscFree(bjac->gsdata.rlens);
    PetscFree(bjac->gsdata.rlensb);
  }
  PetscFree(bjac);
  if (jac->l_lens) PetscFree(jac->l_lens);
  if (jac->g_lens) PetscFree(jac->g_lens);
  PetscFree(jac); 
  PetscFunctionReturn(0);
}

#undef __FUNC__  
#define __FUNC__ "PCSetUpOnBlocks_BJacobi_BAIJ"
int PCSetUpOnBlocks_BJacobi_BAIJ(PC pc)
{
  PC_BJacobi      *jac = (PC_BJacobi *) pc->data;
  int             ierr,i,n_local = jac->n_local;
  PC_BJacobi_BAIJ *bjac = (PC_BJacobi_BAIJ *) jac->data;

  PetscFunctionBegin;
  for (i=0; i<n_local; i++) {
    ierr = SLESSetUp(jac->sles[i],bjac->x[i],bjac->y[i]); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
      Preconditioner for block Jacobi 
*/
#undef __FUNC__  
#define __FUNC__ "PCApply_BJacobi_BAIJ"
int PCApply_BJacobi_BAIJ(PC pc,Vec x, Vec y)
{
  PC_BJacobi      *jac = (PC_BJacobi *) pc->data;
  int             ierr,its,i,n_local = jac->n_local;
  PC_BJacobi_BAIJ *bjac = (PC_BJacobi_BAIJ *) jac->data;
  Scalar          *xin,*yin,*lx_array,*ly_array;
   
  PetscFunctionBegin;
  ierr = VecGetArray(x,&xin);CHKERRQ(ierr);
  ierr = VecGetArray(y,&yin);CHKERRQ(ierr);
  for (i=0; i<n_local; i++) {
    /* 
       To avoid copying the subvector from x into a workspace we instead 
       make the workspace vector array point to the subpart of the array of
       the global vector.
    */
    ierr = VecGetArray(bjac->x[i],&lx_array);CHKERRQ(ierr); 
    ierr = VecGetArray(bjac->y[i],&ly_array);CHKERRQ(ierr); 
    ierr = VecPlaceArray(bjac->x[i],xin+bjac->starts[i]);CHKERRQ(ierr);
    ierr = VecPlaceArray(bjac->y[i],yin+bjac->starts[i]);CHKERRQ(ierr); 
    ierr = SLESSolve(jac->sles[i],bjac->x[i],bjac->y[i],&its); CHKERRQ(ierr);
    ierr = VecPlaceArray(bjac->x[i],lx_array); CHKERRQ(ierr);
    ierr = VecPlaceArray(bjac->y[i],ly_array); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
     Preconditioner for block Gauss-Seidel
*/
#undef __FUNC__  
#define __FUNC__ "PCApply_BGS_BAIJ"
int PCApply_BGS_BAIJ(PC pc,Vec x, Vec y)
{
  PC_BJacobi      *jac = (PC_BJacobi *) pc->data;
  Mat_SeqBAIJ     *A;
  Mat_MPIBAIJ     *Ampi;
  int             ierr,its,i,j,k,l,n_local = jac->n_local,n,row,*idx;
  int             max,*ai, *blks, *aj,*st,*lc,*uc,tmp,bs,bs2;
  PC_BJacobi_BAIJ *bjac = (PC_BJacobi_BAIJ *) jac->data;
  Scalar          *xin,*yin,*lx_array,*ly_array,sum,*b,*v,*yins, *aa;
  PC_GSBAIJ       *gs = &bjac->gsdata;
  Vec             *lx, *ly;
  SLES            *lsles ;

  PetscFunctionBegin;
  PLogEventBegin(PC_Apply,pc,x,y,0);
  /* access local matrix data (in parallel case this is the diagonal block */
  if (pc->mat->type == MATSEQBAIJ) {
    A = (Mat_SeqBAIJ*) pc->mat->data;
  } else {
    Ampi = (Mat_MPIBAIJ*) pc->mat->data;
    A    = (Mat_SeqBAIJ*) Ampi->A->data; 
  }
  ierr = VecGetArray(x,&xin);CHKERRQ(ierr);
  ierr = VecGetArray(y,&yin);CHKERRQ(ierr);
  yins = yin;

  /*some shorts to ptrs*/
  aa   = A->a;
  aj   = A->j;
  ai   = A->i;
  bs   = A->bs;
  bs2  = bs*bs;
  blks = gs->blocks;
  lc   = gs->rlens;     /* lower count */
  uc   = gs->rlensb;    /* upper count */
  st   = bjac->starts;
  lx   = bjac->x;
  ly   = bjac->y;
  lsles= jac->sles;
  /*for both PCBGS_FORWARD_SWEEP and PCBGS_SYMMETRIC_SWEEP */
  for (i=0; i<n_local; i++) {
    row  = blks[i];
    ierr = VecGetArray(lx[i],&lx_array);CHKERRQ(ierr);
    b    = xin + st[i];
    
    /* do rectangular multiply (D+L)'b */
    for (j=0, max=blks[i+1]-blks[i]; j<max; j++) {
      for (k=0; k<bs; k++) { /* for each row in the block_row */
        idx = aj + ai[row];
        n   = lc[row];
        v   = aa + ai[row]*bs2 + k;
        sum = b[j*bs+k];
        while(n--) {
          for (l=0; l<bs; l++) {
            sum -= *v * yins[(*idx)*bs+l]; 
            v   += bs;
          }
          idx ++;
        }
        lx_array[j*bs+k] = sum;
      }
      row++;
    }
    ierr = VecGetArray(ly[i],&ly_array);CHKERRQ(ierr);
    ierr = VecPlaceArray(ly[i],yin+st[i]);CHKERRQ(ierr); 
    ierr = SLESSolve(lsles[i],lx[i],ly[i],&its); CHKERRQ(ierr);
    ierr = VecPlaceArray(ly[i],ly_array);CHKERRQ(ierr);
  }
  PLogFlops(2*gs->nzl);    /* End Forward sweep*/

  if (jac->gstype == PCBGS_SYMMETRIC_SWEEP) {
    for (i=n_local-1; i>=0; i--) {
      row  = blks[i];
      ierr = VecGetArray(lx[i],&lx_array);CHKERRQ(ierr);
      b    = xin + st[i];

      for (j=0, max=blks[i+1]-blks[i]; j<max; j++) {
        for (k=0; k<bs; k++) {
          /*do U*/
          tmp =  ai[row] + uc[row];
          v   = aa + tmp*bs2 +k;
          idx = aj + tmp;
          n   = ai[row+1] - tmp;
          sum = b[j*bs+k];
          while (n--) {
            for (l=0; l<bs; l++) {
              sum -= *v * yins[(*idx)*bs+l];
              v += bs;
            }
            idx++;
          } 
          /*do L*/
          
          v   = aa + ai[row]*bs2 + k;
          idx = aj + ai[row];
          n   = lc[row];
          while (n--) {
            for (l=0; l<bs; l++) {
              sum -= *v * yins[(*idx)*bs+l];
              v += bs;
            }
            idx++;
          }
          lx_array[j*bs+k] = sum;
        }
        row++;
      }
      
      ierr = VecGetArray(ly[i],&ly_array);CHKERRQ(ierr);
      ierr = VecPlaceArray(ly[i],yin+st[i]);CHKERRQ(ierr); 
      ierr = SLESSolve(lsles[i],lx[i],ly[i],&its); CHKERRQ(ierr);
      ierr = VecPlaceArray(ly[i],ly_array);CHKERRQ(ierr);
    }
    PLogFlops(2*(gs->nzl+gs->nzu));
  }
  PLogEventEnd(PC_Apply,pc,x,y,0);

  PetscFunctionReturn(0);
}


extern int PCSetUp_BJacobi_MPIBAIJ(PC);
extern int PCSetUp_BJacobi_SeqBAIJ(PC);

#undef __FUNC__  
#define __FUNC__ "PCSetUp_BJacobi_BAIJ"
int PCSetUp_BJacobi_BAIJ(PC pc)
{
  PC_BJacobi          *jac = (PC_BJacobi *) pc->data;
  Mat                 mat = pc->mat, pmat = pc->pmat;
  int                 ierr, m, n_local, N, M, start, i, rank, size, sum, end, i_start;
  int                 bs, i_end;
  char                *prefix;
  SLES                sles;
  Vec                 x,y;
  PC_BJacobi_BAIJ     *bjac = (PC_BJacobi_BAIJ *) jac->data;
  KSP                 subksp;
  PC                  subpc;
  IS                  is;
  MatGetSubMatrixCall scall = MAT_REUSE_MATRIX;

  PetscFunctionBegin;
  MPI_Comm_rank(pc->comm,&rank);
  MPI_Comm_size(pc->comm,&size);
  ierr = MatGetLocalSize(pc->pmat,&M,&N); CHKERRQ(ierr);
  ierr = MatGetBlockSize(pc->pmat,&bs); CHKERRQ(ierr);

  /*   local block count  given */
  if (jac->n_local > 0 && jac->n < 0) {
    ierr = MPI_Allreduce(&jac->n_local,&jac->n,1,MPI_INT,MPI_SUM,pc->comm);CHKERRQ(ierr);
    if (jac->l_lens) { /* check that user set these correctly */
      sum = 0;
      for (i=0; i<jac->n_local; i++) {
        if (jac->l_lens[i]/bs*bs !=jac->l_lens[i]) {
          SETERRQ(PETSC_ERR_ARG_SIZ,0,"Mat blocksize doesn't match block Jacobi layout");
        }
        sum += jac->l_lens[i];
      }
      if (sum != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Local lens sent incorrectly");
    }
  }
  /* global block count given */
  else if (jac->n > 0 && jac->n_local < 0) {
    /* global blocks given: determine which ones are local */
    if (jac->g_lens) {
      /* check if the g_lens is has valid entries */
      for (i=0; i<jac->n; i++) {
        if (!jac->g_lens[i]) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Zero block not allowed");
        if (jac->g_lens[i]/bs*bs != jac->g_lens[i]) {
          SETERRQ(PETSC_ERR_ARG_SIZ,0,"Mat blocksize doesn't match block Jacobi layout");
        }
      }
      if (size == 1) {
        jac->n_local = jac->n;
        jac->l_lens  = (int *) PetscMalloc(jac->n_local*sizeof(int));CHKPTRQ(jac->l_lens);
        PetscMemcpy(jac->l_lens,jac->g_lens,jac->n_local*sizeof(int));
        /* check that user set these correctly */
        sum = 0;
        for (i=0; i<jac->n_local; i++) sum += jac->l_lens[i];
        if (sum != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Global lens sent incorrectly");
      } else {
        MatGetOwnershipRange(pc->pmat,&start,&end);
        /* loop over blocks determing first one owned by me */
        sum = 0;
        for (i=0; i<jac->n+1; i++) {
          if (sum == start) { i_start = i; goto start_1;}
          if (i < jac->n) sum += jac->g_lens[i];
        }
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Block sizes\n\
                   used in PCBJacobiSetTotalBlocks()\n\
                   are not compatible with parallel matrix layout");
 start_1: 
        for (i=i_start; i<jac->n+1; i++) {
          if (sum == end) { i_end = i; goto end_1; }
          if (i < jac->n) sum += jac->g_lens[i];
        }          
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Block sizes\n\
                      used in PCBJacobiSetTotalBlocks()\n\
                      are not compatible with parallel matrix layout");
 end_1: 
        jac->n_local = i_end - i_start;
        jac->l_lens = (int *) PetscMalloc(jac->n_local*sizeof(int));CHKPTRQ(jac->l_lens); 
        PetscMemcpy(jac->l_lens,jac->g_lens+i_start,jac->n_local*sizeof(int));
      }
    }
    /* no global blocks given, determine then using default layout */
    else {
      jac->n_local = jac->n/size + ((jac->n % size) > rank);
      jac->l_lens  = (int *) PetscMalloc(jac->n_local*sizeof(int));CHKPTRQ(jac->l_lens);
      for (i=0; i<jac->n_local; i++) {
        jac->l_lens[i] = ((M/bs)/jac->n_local + (((M/bs) % jac->n_local) > i))*bs;
        if (!jac->l_lens[i]) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Too many blocks given");
      }
    }
  }
  /* no blocks given */
  else if (jac->n < 0 && jac->n_local < 0) {
    jac->n         = size;
    jac->n_local   = 1;
    jac->l_lens    = (int *) PetscMalloc(sizeof(int));CHKPTRQ(jac->l_lens);
    jac->l_lens[0] = M;
  }

  /* special case of one block on the processor */
  if (pmat->type == MATMPIBAIJ && jac->n_local == 1  && !pc->modifysubmatrices) {
    ierr = PCSetUp_BJacobi_MPIBAIJ(pc);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else if (pmat->type == MATSEQBAIJ && jac->n_local == 1  && !pc->modifysubmatrices) {
    ierr = PCSetUp_BJacobi_SeqBAIJ(pc);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if parallel matrix extract out diagonal block */
  if (pmat->type == MATMPIBAIJ) {
    Mat_MPIBAIJ *tmat  = (Mat_MPIBAIJ*) mat->data;
    Mat_MPIBAIJ *tpmat = (Mat_MPIBAIJ*) pmat->data;
    mat   = tmat->A;
    pmat  = tpmat->A;
  }

  n_local = jac->n_local;

  if (jac->use_true_local) {
    if (pc->mat->type != pc->pmat->type) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices not of same type");
  }

  /* set default direct solver with no Krylov method */
  if (!pc->setupcalled) {
    scall             = MAT_INITIAL_MATRIX;
    pc->destroy       = PCDestroy_BJacobi_BAIJ;
    pc->apply         = PCApply_BJacobi_BAIJ;
    pc->setuponblocks = PCSetUpOnBlocks_BJacobi_BAIJ;

    bjac         = (PC_BJacobi_BAIJ *) PetscMalloc(sizeof(PC_BJacobi_BAIJ));CHKPTRQ(bjac);
    PLogObjectMemory(pc,sizeof(PC_BJacobi_BAIJ));
    jac->sles    = (SLES*) PetscMalloc(n_local*sizeof(SLES)); CHKPTRQ(jac->sles);
    bjac->x      = (Vec*) PetscMalloc(2*n_local*sizeof(Vec)); CHKPTRQ(bjac->x);
    bjac->y      = bjac->x + n_local;
    bjac->starts = (int*) PetscMalloc(n_local*sizeof(Scalar));CHKPTRQ(bjac->starts);
    
    jac->data    = (void *) bjac;
    bjac->is     = (IS *) PetscMalloc(n_local*sizeof(IS)); CHKPTRQ(bjac->is);

    /* 
        construct data structure for block triangular solves 
        when block Gauss-Seidel is being used 
    */
    if (jac->gs) {
      Mat_SeqBAIJ  *baij =  (Mat_SeqBAIJ  *) pmat->data;
      PC_GSBAIJ    *gs;
      int          p, q, r, nz, nz2, ct1, ct2;
      
      pc->apply    = PCApply_BGS_BAIJ; /* user Gauss-Seidel preconditioner */
      gs           = &bjac->gsdata;
      gs->m        = baij->mbs;
      gs->m_blocks = n_local;
      gs->blocks   = (int *) PetscMalloc((gs->m_blocks+1)*sizeof(int));CHKPTRQ(gs->blocks);
      gs->rlens    = (int *) PetscMalloc(gs->m*sizeof(int));CHKPTRQ(gs->rlens);
      gs->rlensb   = (int *) PetscMalloc(gs->m*sizeof(int));CHKPTRQ(gs->rlensb);
      gs->blocks[0] = 0;
      for (i=1; i<gs->m_blocks; i++) {
	gs->blocks[i] = gs->blocks[i-1] + jac->l_lens[i-1]/bs; /* div by bs ? */
      }
      gs->blocks[gs->m_blocks] = gs->m;
      /* for (i=0; i<=gs->m_blocks; i++) {
	printf("blocks[%d]: %d   llens = %d \n",i,gs->blocks[i],jac->l_lens[i]); 
      } */

      ct1 = 0; ct2 = 0;
      for (p=0; p<(gs->m_blocks); p++) {
	for (q=gs->blocks[p]; q<gs->blocks[p+1]; q++) {
	  nz=0;
	  nz2=0;
	  for (r=baij->i[q]; r<baij->i[q+1]; r++) {
	    if ((baij->j[r]) < gs->blocks[p]) {nz++;}
	    if ((baij->j[r]) < (gs->blocks[p+1])) {nz2++;}
	    else {break;}
	  }
	  gs->rlens[q] = nz;
	  gs->rlensb[q]= nz2;
          ct1         += nz;
          ct2         += nz2;
          /* printf("rlens[%d]: %d rlensb[%d]: %d\n",q,gs->rlens[q],q,gs->rlensb[q]); */
	}
      }
      gs->nzl = ct1;
      gs->nzu = baij->nz - ct2;
#ifdef USE_PETSC_LOG      
      {
        int ai,bs2=bs*bs;
        Scalar *aa, *v,val;
        double norm, sum1;
        
        aa = baij->a;
        for( p=0,sum1=0.0; p< baij->mbs; ++p ){
          ai = baij->i[p];
          for( q=0; q<gs->rlens[p]; ++q ){
            v =  aa+(ai+q)*bs2;
            for (r=0; r<bs2; r++) {
              val = v[r];
#if defined(USE_PETSC_COMPLEX)
              sum1 += real(conj(val)*(val));
#else
              sum1+= val*val;
#endif
            }
          }
          for( q=ai+gs->rlensb[p]; q < baij->i[p+1]; ++q ) {
            v =  aa+q*bs2;
            for (r=0; r<bs2; r++) {
              val = v[r];
#if defined(USE_PETSC_COMPLEX)
              sum1 += real(conj(val)*(val));
#else
              sum1+= val*val;
#endif
            }
          }
        }
        ct1 = gs->nzu + gs->nzl;
        ierr = MatNorm(pmat,NORM_FROBENIUS,&norm); CHKERRQ(ierr);
        PLogInfo(mat,"PCSetUp_BJacobi_BAIJ:Percent non-zeros thrown away= %4.2f%%, Norm ratio =  %4.2f%% \n", ct1*100.0/baij->nz, sqrt(sum1)*100/norm);
      }
#endif
    } 

    start = 0;
    for (i=0; i<n_local; i++) {
      ierr = SLESCreate(PETSC_COMM_SELF,&sles); CHKERRQ(ierr);
      PLogObjectParent(pc,sles);
      ierr = SLESGetKSP(sles,&subksp); CHKERRQ(ierr);
      ierr = KSPSetType(subksp,KSPPREONLY); CHKERRQ(ierr);
      ierr = SLESGetPC(sles,&subpc); CHKERRQ(ierr);
      ierr = PCSetType(subpc,PCILU); CHKERRQ(ierr);
      ierr = PCGetOptionsPrefix(pc,&prefix); CHKERRQ(ierr);
      ierr = SLESSetOptionsPrefix(sles,prefix); CHKERRQ(ierr);
      ierr = SLESAppendOptionsPrefix(sles,"sub_"); CHKERRQ(ierr);
      ierr = SLESSetFromOptions(sles); CHKERRQ(ierr);

      m = jac->l_lens[i];

      ierr = VecCreateSeq(PETSC_COMM_SELF,m,&x); CHKERRQ(ierr);
      ierr = VecDuplicate(x,&y); CHKERRQ(ierr);
      PLogObjectParent(pc,x);
      PLogObjectParent(pc,y);
      bjac->x[i]      = x;
      bjac->y[i]      = y;
      bjac->starts[i] = start;
      jac->sles[i]    = sles;

      ierr = ISCreateStride(PETSC_COMM_SELF,m,start,1,&is); CHKERRQ(ierr);
      bjac->is[i] = is;
      PLogObjectParent(pc,is);

      start += m;
    }
  } else {
    bjac = (PC_BJacobi_BAIJ *) jac->data;
    /* 
       Destroy the blocks from the previous iteration
    */
    if (pc->flag == DIFFERENT_NONZERO_PATTERN) {
      ierr = MatDestroyMatrices(n_local,&bjac->pmat); CHKERRQ(ierr);
      if (jac->use_true_local) {
        ierr = MatDestroyMatrices(n_local,&bjac->mat); CHKERRQ(ierr);
      }
      scall = MAT_INITIAL_MATRIX;
    }
  }

  ierr = MatGetSubMatrices(pmat,n_local,bjac->is,bjac->is,scall,&bjac->pmat);CHKERRQ(ierr);
  if (jac->use_true_local) {
    ierr = MatGetSubMatrices(mat,n_local,bjac->is,bjac->is,scall,&bjac->mat); CHKERRQ(ierr); 
  }
 
 /* Return control to the user so that the submatrices can be modified (e.g., to apply
     different boundary conditions for the submatrices than for the global problem) */
  ierr = PCModifySubMatrices(pc,n_local,bjac->is,bjac->is,bjac->pmat,pc->modifysubmatricesP); CHKERRQ(ierr);
  for ( i=0; i<n_local; i++ ) {
    PLogObjectParent(pc,bjac->pmat[i]);
    if (jac->use_true_local) {
      PLogObjectParent(pc,bjac->mat[i]);
      ierr = SLESSetOperators(jac->sles[i],bjac->mat[i],bjac->pmat[i],pc->flag);CHKERRQ(ierr);
    } else {
      ierr = SLESSetOperators(jac->sles[i],bjac->pmat[i],bjac->pmat[i],pc->flag);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}






