/**************************************************************************
*                                                                         *
*  Author      : Dr. Thomas Brandes, GMD, SCAI.LAB                        *
*  Copyright   : GMD St. Augustin, Germany                                *
*  Date        : Oct 95                                                   *
*  Last Update : May 96                                                   *
*                                                                         *
*  This Module is part of the DALIB                                       *
*                                                                         *
*  Module      : matmul.m4                                                *
*                                                                         *
*  Function    : Fortran 90 intrinsic MATMUL with distributed arrays      *
*                                                                         *
*  Export :  FORTRAN Interface                                            *
*                                                                         *
**************************************************************************/

#define CHECK
#undef DEBUG

#include "dalib.h"

/**************************************************************************
*                                                                         *
*  GLOBAL DATA                                                            *
*                                                                         *
**************************************************************************/

static void (*dot_prod) ();
int    nbytes;

/**************************************************************************
*                                                                         *
*  DOTPRODUCT  routines                                                   *
*                                                                         *
*    dalib_matmul_dot_float    (c, n, a, a_inc, b)                        *
*    dalib_matmul_dot_int      (c, n, a, a_inc, b)                        *
*    dalib_matmul_dot_double   (c, n, a, a_inc, b)                        *
*    dalib_matmul_dot_bool     (c, n, a, a_inc, b)                        *
*    dalib_matmul_dot_complex  (c, n, a, a_inc, b)                        *
*    dalib_matmul_dot_dcomplex (c, n, a, a_inc, b)                        *
*                                                                         *
**************************************************************************/

void dalib_matmul_dot_float (c, n, a, a_inc, b)
REAL *c, *a, *b;
int   n, a_inc;

{ int i;
  *c = 0.0;
  for (i=0; i<n; i++, a+=a_inc, b++) 
    { /* printf ("%d: add %d: %f * %f\n", pcb.i, i, *a, *b); */
      *c += *a * *b;
    }
  /* printf ("%d: result is %f\n", pcb.i, *c);  */
} /* dalib_matmul_dot_float */

void dalib_matmul_dot_int (c, n, a, a_inc, b)
INTEGER *c, *a, *b;
int n, a_inc;

{ int i;
  *c = 0;
  for (i=0; i<n; i++, a+=a_inc, b++) *c += *a * *b;
} /* dalib_matmul_dot_int */

void dalib_matmul_dot_double (c, n, a, a_inc, b)
DOUBLE_PRECISION *c, *a, *b;
int    n, a_inc;

{ int i;
  *c = 0.0;
  for (i=0; i<n; i++, a+=a_inc, b++) *c += *a * *b;
} /* dalib_matmul_dot_double */

void dalib_matmul_dot_complex (c, n, a, a_inc, b)
REAL *c, *a, *b;
int  n, a_inc;

{ int i;
  c[0] = 0.0;
  c[1] = 0.0;
  for (i=0; i<n; i++, a+=2*a_inc, b+=2) 
    { c[0] += a[0]*b[0] - a[1]*b[1];
      c[1] += a[0]*b[1] + a[1]*b[0];
    }
} /* dalib_matmul_dot_complex */

void dalib_matmul_dot_dcomplex (c, n, a, a_inc, b)

DOUBLE_PRECISION *c, *a, *b;
int    n, a_inc;
 
{ int i;
  c[0] = 0.0;
  c[1] = 0.0;
  for (i=0; i<n; i++, a+=2*a_inc, b+=2)
    { c[0] += a[0]*b[0] - a[1]*b[1];
      c[1] += a[0]*b[1] + a[1]*b[0];
    }
} /* dalib_matmul_dot_dcomplex */

void dalib_matmul_dot_bool (c, n, a, a_inc, b)
INTEGER *c, *a, *b;
int n, a_inc;

{ int i;
  *c = 0;
  for (i=0; i<n; i++, a+=a_inc, b++)
     if (*a && *b) *c = 1;
} /* dalib_matmul_dot_bool */

/**************************************************************************
*                                                                         *
*   dalib_check_mat_shape (c_n1, c_n2, a_n1, a_n2, b_n1, b_n2)            *
*                                                                         *
*   C(c_n1, c_n2) = matmul (A(a_n1, a_n2), B(b_n1, b_n2))                 *
*                                                                         *
**************************************************************************/

static void dalib_check_mat_shape (c_n1, c_n2, a_n1, a_n2, b_n1, b_n2)
int c_n1, c_n2, a_n1, a_n2, b_n1, b_n2;

{ if ((c_n1 != a_n1) || (c_n2 != b_n2) || (a_n2 != b_n1))
     { dalib_internal_error ("wrong shapes in MATMUL");
       printf ("C(%d,%d) = A(%d,%d) o B(%d,%d)\n",
                c_n1, c_n2, a_n1, a_n2, b_n1, b_n2);
       dalib_stop ();
     }
#ifdef DEBUG
       printf ("%d: MATMUL of C(%d,%d) = A(%d,%d) o B(%d,%d)\n",
                pcb.i, c_n1, c_n2, a_n1, a_n2, b_n1, b_n2);
#endif

} /* dalib_check_mat_shape */

/**************************************************************************
*                                                                         *
*                                                                         *
**************************************************************************/

static void dalib_address2 (a_dsp, a_data, a_inc1, a_inc2)

array_info    a_dsp;
unsigned char **a_data;
int           *a_inc1;
int           *a_inc2;

{ int total[MAX_DIMENSIONS+1];
  int first, low;
  unsigned char *data;
  DimInfo *array_dim;
  int array_size;

  array_size = a_dsp->size;

  dalib_array_addressing (a_dsp, pcb.i, &data, &first, total);

       /************************************************
       *                                               *
       *    A(i1,i2) = a_data + i1 * 1 + i2 * total[1] *
       *                                               *
       ************************************************/

  array_dim = a_dsp->dimensions;
  low       = array_dim->local_size[0];
  array_dim++;
  low       = low + array_dim->local_size[0] * total[1];

  data     += (low - first) * nbytes;

  *a_data  = data;
  *a_inc1  = total[0];
  *a_inc2  = total[1];

} /* dalib_address2 */

static void dalib_address1 (a_dsp, a_data, a_inc1)

array_info     a_dsp;
unsigned char  **a_data;
int            *a_inc1;

{ int total[MAX_DIMENSIONS+1];
  int first, low;
  unsigned char *data;
  DimInfo *array_dim;
  int array_size;

  array_size = a_dsp->size;

  dalib_array_addressing (a_dsp, pcb.i, &data, &first, total);

       /************************************************
       *                                               *
       *    A(i1,i2) = a_data + i1 * 1 + i2 * total[1] *
       *                                               *
       ************************************************/

  array_dim = a_dsp->dimensions;
  low       = array_dim->local_size[0];

  data     += (low - first) * nbytes;

  *a_data  = data;
  *a_inc1  = total[0];

} /* dalib_address1 */

/**************************************************************************
*                                                                         *
*         SUBROUTINE MULT22 (c, a, b)                                     *
*         REAL a(:,:), b(:,:), c(:,:)                                     *
*   !hpf$ distribute c (block,block)                                      *
*   !hpf$ align a(i,j) with c(i,*)                                        *
*   !hpf$ align b(i,j) with c(*,j)                                        *
*                                                                         *
*   !hpf$ independent, local_access                                       *
*         do i = 1, ubound (c,1)                                          *
*   !hpf$ independent, local_access                                       *
*           do j = 1, ubound (c,2)                                        *
*              v = 0.0                                                    *
*              do k = 1, ubound(a,2)                                      *
*                 v = v + a(i,k) *  b(k,j)                                *
*              end do                                                     *
*              c(i,j) = v                                                 *
*           end do                                                        *
*         end do                                                          *
*                                                                         *
**************************************************************************/

static void dalib_matmul22 (c_ddsp, a_ddsp, b_ddsp)

array_info a_ddsp, b_ddsp, c_ddsp;

{ array_info c_dsp, b_dsp, a_dsp, dalib_array_new_dsp ();

  int rank, size;
  int top_id, type;
  int shape[MAX_DIMENSIONS];
  int lb1, lb2;
  int copy_flag;

  int c_n1, c_n2;   /* p owns local part of c_n1 * c_n2 */

  int a_1, a_2, b_1, b_2, c_1, c_2;  /* global sizes of arrays */

  int dim1, dim2;
  int type1, type2;

  int base, stride;
  int i, j;

  unsigned char *a, *b, *c;    /* pointer to the local data */
  unsigned char *a_data, *b_data, *c_data;
  int  a_inc2;
  int  b_inc2;
  int  c_inc2;
  int dummy;

  size = nbytes;

  c_dsp = dalib_array_new_dsp (2, size);
  b_dsp = dalib_array_new_dsp (2, size);
  a_dsp = dalib_array_new_dsp (2, size);

  /* define the distributions of the new descriptors */

  top_id = 2; type = kBLOCK_DIM; size = 0;

  FUNCTION(dalib_distribute) (&c_dsp, &top_id, &type, &size, &dummy, &dummy, 
                                               &type, &size, &dummy, &dummy); 

  dim1 = 1; dim2 = 2;

  type1 = kSERIAL_DIM; type2 = kALIGNED_DIM;
  base = 0; stride = 1;

  FUNCTION(dalib_align_source) (&b_dsp, &c_dsp, &type1, &dim1, &base, &stride, 
                                                &type2, &dim2, &base, &stride);

  type1 = kREPLICATED_DIM;
  type2 = kSOURCE_DIM;

  FUNCTION(dalib_align_target) (&b_dsp, &c_dsp, &type1, &dim1, &type2, &dim2); 

  type1 = kALIGNED_DIM;
  type2 = kSERIAL_DIM;

  FUNCTION(dalib_align_source) (&a_dsp, &c_dsp, &type1, &dim1, &base, &stride, 
                                                &type2, &dim2, &base, &stride);

  type1 = kSOURCE_DIM;
  type2 = kREPLICATED_DIM;

  FUNCTION(dalib_align_target) (&a_dsp, &c_dsp, &type1, &dim1, &type2, &dim2); 

  /* now inherit the size to the new descriptors */

  dalib_secarray_shape (&c_ddsp, &rank, shape);
  c_1 = shape[0]; c_2 = shape[1];

  dalib_secarray_shape (&b_ddsp, &rank, shape);
  b_1 = shape[0]; b_2 = shape[1];

  dalib_secarray_shape (&a_ddsp, &rank, shape);
  a_1 = shape[0]; a_2 = shape[1];

  lb1 = 1; lb2 = 1;

  FUNCTION(dalib_array_define) (&c_dsp, &lb1, &c_1, &lb2, &c_2); 
  FUNCTION(dalib_array_define) (&b_dsp, &lb1, &b_1, &lb2, &b_2); 
  FUNCTION(dalib_array_define) (&a_dsp, &lb1, &a_1, &lb2, &a_2); 

  /* verify : c_1 == a_1, c_2 == b_2, a_2 == b_1  */

  dalib_check_mat_shape (c_1, c_2, a_1, a_2, b_1, b_2);

  FUNCTION(dalib_array_query) (&c_dsp, &dim1, &lb1, &c_n1, &dummy, &dummy); 
  c_n1 -= lb1 - 1;

  FUNCTION(dalib_array_query) (&c_dsp, &dim2, &lb2, &c_n2, &dummy, &dummy); 
  c_n2 -= lb2 - 1;

#ifdef DEBUG
  printf ("%d: local size of c : %d x %d \n", pcb.i, c_n1, c_n2);
#endif

  copy_flag = 1; 
  FUNCTION(dalib_copy_in) (&a_dsp, &a_ddsp, (char *) 0, &copy_flag); 
  FUNCTION(dalib_copy_in) (&b_dsp, &b_ddsp, (char *) 0, &copy_flag); 

  copy_flag = 0; 
  FUNCTION(dalib_copy_in) (&c_dsp, &c_ddsp, (char *) 0, &copy_flag); 

  /* now we compute addresses for the data */

  c    = c_dsp->data;
  size = c_dsp->size;   /* number of bytes for one element */

  /* local sizes :   c(c_low1:c_high1,c_low2:c_high2)
                     a(c_low1:c_high1,1:a2)
                     b(1:b1,          c_low2:c_high2)

  */

  dalib_address2 (b_dsp, &b_data, &dummy, &b_inc2);
  dalib_address2 (c_dsp, &c_data, &dummy, &c_inc2);
  dalib_address2 (a_dsp, &a_data, &dummy, &a_inc2);

  /* currently we assume that a_inc1, b_inc1, c_inc1 are equal 1 */

  for (j=0; j < c_n2; j++)

    { b = b_data + j * b_inc2 * size;   /* b points to b(:,j) */
      c = c_data + j * c_inc2 * size;   /* c points to c(:,j) */
      a = a_data;

      for (i=0; i < c_n1; i++)

        { /* a -> a(i,:) i-th row,  b -> b(:,j) j-th column     */

#ifdef DEBUG
          printf ("compute c(%d,%d)\n", i, j);
#endif

          dot_prod (c, a_2, a, a_inc2, b);

          c += size;
          a += size;
        }
    }

   copy_flag = 0; FUNCTION(dalib_copy_out) (&a_dsp, &a_ddsp, &copy_flag); 
   copy_flag = 0; FUNCTION(dalib_copy_out) (&b_dsp, &b_ddsp, &copy_flag); 
   copy_flag = 1; FUNCTION(dalib_copy_out) (&c_dsp, &c_ddsp, &copy_flag); 

} /* dalib_matmul22 */

/**************************************************************************
*                                                                         *
*         SUBROUTINE MULT21 (c, a, b)                                     *
*         REAL a(:,:), b(:), c(:)                                         *
*   !hpf$ distribute c (block)                                            *
*   !hpf$ align a(i,j) with c(i)                                          *
*   !hpf$ align b(j) with c(*)                                            *
*                                                                         *
*   !hpf$ independent, local_access                                       *
*         do i = 1, ubound (c,1)                                          *
*              v = 0.0                                                    *
*              do k = 1, ubound(a,2)                                      *
*                 v = v + a(i,k) *  b(k)                                  *
*              end do                                                     *
*              c(i) = v                                                   *
*         end do                                                          *
*                                                                         *
**************************************************************************/

static void dalib_matmul21 (c_ddsp, a_ddsp, b_ddsp)

array_info a_ddsp, b_ddsp, c_ddsp;

{ array_info c_dsp, b_dsp, a_dsp, dalib_array_new_dsp();

  int size;
  int top_id, type;
  int lb1, lb2;
  int shape[MAX_DIMENSIONS];
  int copy_flag;

  int c_n1;             /* p owns local part of c_n1 */

  int a_1, a_2, b_1, c_1;     /* global sizes of arrays */

  int dim1, dim2;
  int type1, type2;

  int base, stride;
  int i;

  unsigned char *a, *b, *c;    /* pointer to the local data */
  int a_inc2;
  int dummy;
  int rank;

  c_dsp = dalib_array_new_dsp (1, nbytes);
  b_dsp = dalib_array_new_dsp (1, nbytes);
  a_dsp = dalib_array_new_dsp (2, nbytes);

  /* define distributions

     !HPF$ DISTRIBUTE C (BLOCK) */

  top_id = 1; type = kBLOCK_DIM; size = 0;
  FUNCTION(dalib_distribute) (&c_dsp, &top_id, &type, &size, &dummy, &dummy); 

  dim1 = 1; dim2 = 2;

  /* !HPF$ ALIGN B(*) WITH C(*) */

  type1 = kSERIAL_DIM; 
  base = 0; stride = 1;

  FUNCTION(dalib_align_source) (&b_dsp, &c_dsp, &type1, &dim1, &base, &stride); 

  type1 = kREPLICATED_DIM;
  FUNCTION(dalib_align_target) (&b_dsp, &c_dsp, &type1, &dim1); 

  /* HPF$ align A(I,*) with C(I) */

  type1 = kALIGNED_DIM;
  type2 = kSERIAL_DIM;
  FUNCTION(dalib_align_source) (&a_dsp, &c_dsp, &type1, &dim1, &base, &stride, 
                                                &type2, &dim2, &base, &stride);

  type1 = kSOURCE_DIM;
  FUNCTION(dalib_align_target) (&a_dsp, &c_dsp, &type1, &dim1); 

  /* inherit sizes */


  dalib_secarray_shape (&c_ddsp, &rank, shape);
  c_1 = shape[0]; 

  dalib_secarray_shape (&b_ddsp, &rank, shape);
  b_1 = shape[0];

  dalib_secarray_shape (&a_ddsp, &rank, shape);
  a_1 = shape[0]; a_2 = shape[1];

  lb1 = 1; lb2 = 1;

  FUNCTION(dalib_array_define) (&c_dsp, &lb1, &c_1);
  FUNCTION(dalib_array_define) (&b_dsp, &lb1, &b_1);
  FUNCTION(dalib_array_define) (&a_dsp, &lb1, &a_1, &lb2, &a_2);
 
  /* verify : c_1 == a_1, a_2 == b_1  */

  dalib_check_mat_shape (c_1, 1, a_1, a_2, b_1, 1);

  FUNCTION(dalib_array_query) (&c_dsp, &dim1, &lb1, &c_n1, &dummy, &dummy); 
  c_n1 -= lb1 - 1;

#ifdef DEBUG
  printf ("%d: local size of c : %d\n", pcb.i, c_n1);
#endif

  copy_flag = 1; 
  FUNCTION(dalib_copy_in) (&a_dsp, &a_ddsp, (char *) 0, &copy_flag); 
  FUNCTION(dalib_copy_in) (&b_dsp, &b_ddsp, (char *) 0, &copy_flag); 

  copy_flag = 0; 
  FUNCTION(dalib_copy_in) (&c_dsp, &c_ddsp, (char *) 0, &copy_flag); 

  /* now we compute addresses for the data */

  c    = c_dsp->data;
  size = c_dsp->size;   /* number of bytes for one element */

  /* local sizes :   c(c_low1:c_high1)
                     a(c_low1:c_high1,1:a2)
                     b(1:b1)

  */

  dalib_address1 (b_dsp, &b, &dummy);
  dalib_address1 (c_dsp, &c, &dummy);
  dalib_address2 (a_dsp, &a, &dummy, &a_inc2);

  for (i=0; i < c_n1; i++)
     { /* a -> a(i,:) i-th row,  b -> b(:) j-th column     */

#ifdef DEBUG
       printf ("compute c(%d)\n", i);
#endif 

       dot_prod (c, a_2, a, a_inc2, b);

       c += size;
       a += size;
    }

   copy_flag = 0; FUNCTION(dalib_copy_out) (&a_dsp, &a_ddsp, &copy_flag); 
   copy_flag = 0; FUNCTION(dalib_copy_out) (&b_dsp, &b_ddsp, &copy_flag); 
   copy_flag = 1; FUNCTION(dalib_copy_out) (&c_dsp, &c_ddsp, &copy_flag); 

} /* dalib_matmul21 */

/**************************************************************************
*                                                                         *
*         SUBROUTINE MULT12 (c, a, b)                                     *
*         REAL a(:), b(:,:), c(:)                                         *
*   !hpf$ distribute c (block)                                            *
*   !hpf$ align a(j) with c(*)                                            *
*   !hpf$ align b(i,j) with c(j)                                          *
*                                                                         *
*   !hpf$ independent, local_access                                       *
*         do j = 1, ubound (c,1)                                          *
*            v = 0.0                                                      *
*            do k = 1, ubound(a,1)                                        *
*               v = v + a(k) *  b(k,j)                                    *
*            end do                                                       *
*            c(j) = v                                                     *
*         end do                                                          *
*                                                                         *
**************************************************************************/

static void dalib_matmul12 (c_ddsp, a_ddsp, b_ddsp)

array_info a_ddsp, b_ddsp, c_ddsp;

{ array_info c_dsp, b_dsp, a_dsp, dalib_array_new_dsp ();

  int size;
  int top_id, type;
  int lb1, lb2;
  int shape[MAX_DIMENSIONS];
  int copy_flag;

  int  c_n2;   /* p owns local part of size c_n2 */

  int a_1, a_2, b_1, b_2, c_1, c_2;  /* global sizes of arrays */

  int dim1, dim2;
  int type1, type2;

  int base, stride;
  int j;

  unsigned char *a, *b, *c;    /* pointer to the local data */
  int b_inc2;
  int dummy;
  int rank;

  size = nbytes;

  c_dsp = dalib_array_new_dsp (1, nbytes);
  a_dsp = dalib_array_new_dsp (1, nbytes);

  b_dsp = dalib_array_new_dsp (2, nbytes);

  /* define distributions */

  dim1 = 1; dim2 = 2;

  top_id = 1; type = kBLOCK_DIM;; size = 0;
  FUNCTION(dalib_distribute) (&c_dsp, &top_id, &type, &size, &dummy, &dummy); 

  /* HPF$ ALIGN B(*,J) WITH C(J) */

  type1 = kSERIAL_DIM; type2 = kALIGNED_DIM;
  base  = 0; stride = 1;

  FUNCTION(dalib_align_source) (&b_dsp, &c_dsp, &type1, &dim1, &base, &stride, 
                                                &type2, &dim1, &base, &stride);

  type2 = kSOURCE_DIM;

  FUNCTION(dalib_align_target) (&b_dsp, &c_dsp, &type2, &dim2); 

  type1 = kSERIAL_DIM;

  FUNCTION(dalib_align_source) (&a_dsp, &c_dsp, &type1, &dim1, &base, &stride); 

  type1 = kREPLICATED_DIM;

  FUNCTION(dalib_align_target) (&a_dsp, &c_dsp, &type1, &dim1); 

  /* Inherit sizes */
 
  lb1 = 1; lb2 = 1;

  dalib_secarray_shape (&c_ddsp, &rank, shape);
  c_2 = shape[0]; 

  dalib_secarray_shape (&b_ddsp, &rank, shape);
  b_1 = shape[0]; b_2 = shape[1];

  dalib_secarray_shape (&a_ddsp, &rank, shape);
  a_2 = shape[0]; 

  /* important: call of define of template at first */

  FUNCTION(dalib_array_define) (&c_dsp, &lb2, &c_2);
  FUNCTION(dalib_array_define) (&b_dsp, &lb1, &b_1, &lb2, &b_2);
  FUNCTION(dalib_array_define) (&a_dsp, &lb2, &a_2);
 
  /* verify : c_2 == b_2, a_2 == b_1  */

  dalib_check_mat_shape (1, c_2, 1, a_2, b_1, b_2);

  FUNCTION(dalib_array_query) (&c_dsp, &dim1, &lb2, &c_n2, &dummy, &dummy); 
  c_n2 -= lb2 - 1;

#ifdef DEBUG
  printf ("%d: local size of c : %d \n", pcb.i, c_n2);
#endif

  copy_flag = 1; 
  FUNCTION(dalib_copy_in) (&a_dsp, &a_ddsp, (char *) 0, &copy_flag); 
  FUNCTION(dalib_copy_in) (&b_dsp, &b_ddsp, (char *) 0, &copy_flag); 
  copy_flag = 0; 
  FUNCTION(dalib_copy_in) (&c_dsp, &c_ddsp, (char *) 0, &copy_flag); 

  /* now we compute addresses for the data */

  c    = c_dsp->data;
  size = c_dsp->size;   /* number of bytes for one element */

  /* local sizes :   c(c_low2:c_high2)
                     a(1:a2)
                     b(1:b1,          c_low2:c_high2)

  */

  dalib_address1 (a_dsp, &a, &dummy);
  dalib_address1 (c_dsp, &c, &dummy);
  dalib_address2 (b_dsp, &b, &dummy, &b_inc2);

  for (j=0; j < c_n2; j++)

    { /* a -> a(i,:) i-th row,  b -> b(:,j) j-th column     */

#ifdef DEBUG
      printf ("compute c(%d)\n", j);
#endif

      dot_prod (c, a_2, a, 1, b);

      b += b_inc2 * size;   /* set b to next column */
      c += size;
    }

   copy_flag = 0; FUNCTION(dalib_copy_out) (&a_dsp, &a_ddsp, &copy_flag); 
   copy_flag = 0; FUNCTION(dalib_copy_out) (&b_dsp, &b_ddsp, &copy_flag); 
   copy_flag = 1; FUNCTION(dalib_copy_out) (&c_dsp, &c_ddsp, &copy_flag); 

} /* dalib_matmul12 */

/**************************************************************************
*                                                                         *
*   void dalib_get_info (array_info = int rank, int size)                 *
*                                                                         *
**************************************************************************/

static void dalib_get_info (a_dsp, a_rank, a_size)
array_info a_dsp;
int *a_rank, *a_size;

{ section_info s_dsp;

  if (dalib_is_array_info (a_dsp))
     { *a_rank = a_dsp->rank;
       *a_size = a_dsp->size;
     }
   else if (dalib_is_section_info (a_dsp))
     { *a_rank = dalib_section_rank (a_dsp);
       s_dsp   = (section_info) a_dsp;
       *a_size = s_dsp->array_id->size;
     }
   else
     { dalib_internal_error ("not array/section in MATMUL");
       dalib_stop ();
     }
} /* dalib_get_rank */

/**************************************************************************
*                                                                         *
*  FORTRAN Interface                                                      *
*                                                                         *
* FUNCTION(dalib_matmul) (op, c_data, a_data, b_data, * 
*                     matrix_c, matrix_a, matrix_b)                       *
*                                                                         *
**************************************************************************/

void FUNCTION(dalib_matmul) (op, d1, d2, d3, matrix_c_dsp, matrix_a_dsp, matrix_b_dsp) 

int        *op;               /* specifies kind of multiplication/sum */
array_info *matrix_a_dsp;
array_info *matrix_b_dsp;
array_info *matrix_c_dsp;
char       *d1, *d2, *d3;

{ int rank_a, rank_b, rank_c;
  int size_a, size_b, size_c;
  int size;

  dalib_get_info (*matrix_a_dsp, &rank_a, &size_a);
  dalib_get_info (*matrix_b_dsp, &rank_b, &size_b);
  dalib_get_info (*matrix_c_dsp, &rank_c, &size_c);

  switch (*op) {

   case 7  : size= sizeof(INTEGER);     
             dot_prod= dalib_matmul_dot_int; break;
   case 8  : size= sizeof(REAL);   
             dot_prod= dalib_matmul_dot_float; break;
   case 9  : size= sizeof(DOUBLE_PRECISION);  
             dot_prod= dalib_matmul_dot_double; break;
   case 17 : size= sizeof(INTEGER);     
             dot_prod= dalib_matmul_dot_bool; break;
   case 19 : size= 2*sizeof(REAL); 
             dot_prod= dalib_matmul_dot_complex; break;

   default : dalib_internal_error ("MATMUL illegal operation");
             printf ("operation = %d\n", *op);
             dalib_stop ();
             break;
   }

  if ((size != size_a) || (size != size_b) || (size != size_c))
     { dalib_internal_error ("MATMUL: arrays have different types");
       dalib_stop ();
     }

#ifdef DEBUG
  printf ("%d: MATMUL C(rank=%d) = A(rank=%d) o B (rank=%d), size = %d\n",
           pcb.i, rank_c, rank_a, rank_b, size_a);
#endif 

  nbytes = size;

  if (rank_a == 1)
     dalib_matmul12 (*matrix_c_dsp, *matrix_a_dsp, *matrix_b_dsp);
   else if (rank_b == 1)
     dalib_matmul21 (*matrix_c_dsp, *matrix_a_dsp, *matrix_b_dsp);
   else
     dalib_matmul22 (*matrix_c_dsp, *matrix_a_dsp, *matrix_b_dsp);

} /* FUNCTION(dalib_matmul) */ 
