/*
 * This file is part of the Pablo Performance Analysis Environment
 *
 *                                           TM
 * The Pablo Performance Analysis Environment   software is *not* in
 * the public domain.  However, it is freely available without fee for
 * education, research, and non-profit purposes.  By obtaining copies
 * of this and other files that comprise the Pablo Performance Analysis
 * Environment, you, the Licensee, agree to abide by the following
 * conditions and understandings with respect to the copyrighted software:
 * 
 * 1.  The software is copyrighted in the name of the Board of Trustees
 *     of the University of Illinois (UI), and ownership of the software
 *     remains with the UI. 
 *
 * 2.  Permission to use, copy, and modify this software and its documentation
 *     for education, research, and non-profit purposes is hereby granted
 *     to Licensee, provided that the copyright notice, the original author's
 *     names and unit identification, and this permission notice appear on
 *     all such copies, and that no charge be made for such copies.  Any
 *     entity desiring permission to incorporate this software into commercial
 *     products should contact:
 *
 *          Professor Daniel A. Reed                 reed@cs.uiuc.edu
 *          University of Illinois
 *          Department of Computer Science
 *          2413 Digital Computer Laboratory
 *          1304 West Springfield Avenue
 *          Urbana, Illinois  61801
 *          USA
 *
 * 3.  Licensee may not use the name, logo, or any other symbol of the UI
 *     nor the names of any of its employees nor any adaptation thereof in
 *     advertizing or publicity pertaining to the software without specific
 *     prior written approval of the UI.
 *
 * 4.  THE UI MAKES NO REPRESENTATIONS ABOUT THE SUITABILITY OF THE
 *     SOFTWARE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS
 *     OR IMPLIED WARRANTY.
 *
 * 5.  The UI shall not be liable for any damages suffered by Licensee from
 *     the use of this software.
 *
 * 6.  The software was developed under agreements between the UI and the
 *     Federal Government which entitle the Government to certain rights.
 *
 **************************************************************************
 *
 * Developed by: The TAPESTRY Parallel Computing Laboratory
 *		 University of Illinois at Urbana-Champaign
 *		 Department of Computer Science
 *		 1304 W. Springfield Avenue
 *		 Urbana, IL	61801
 *
 * Copyright (c) 1987-1994
 * The University of Illinois Board of Trustees.
 *	All Rights Reserved.
 *
 * Author: Bradley W. Schwartz (schwartz@cs.uiuc.edu)
 * Project Manager and Principal Investigator:
 *	Daniel A. Reed (reed@cs.uiuc.edu)
 *
 * Funded by: National Science Foundation grants NSF CCR86-57696,
 * NSF CCR87-06653 and NSF CDA87-22836 (Tapestry), NASA ICLASS Contract
 * No. NAG-1-613, DARPA Contract No. DABT63-91-K-0004, by a grant
 * from the Digital Equipment Corporation External Research Program,
 * and by a collaborative research agreement with the Intel Supercomputer
 * Systems Division.
 *
 *
*/

#include "SquareErrorCluster.h"
#include <limits.h>
#include <math.h>
#include <stdlib.h>

// Squares its argument.  The argument expression is expanded twice, so it
// must be free of side effects.
#define Square(x)          ( (x) * (x) )
// Upper bound on the number of migration passes performed by cluster().
#define MAX_ITERATIONS     100

// Explicit C-linkage declaration of rand(), which supplies the random
// numbers for the random initial clustering policy.
extern "C" {
extern int rand();
}


SquareErrorCluster::SquareErrorCluster()
{
     // The working arrays start out unallocated; _allocateDataStructures()
     // creates them when it finds them NULL.
     squareErrors = NULL;
     clusterCounts = NULL;
     clusterCentroids = NULL;

     // Default policies
     migrationPolicy = MP_CASE_CLOSEST;
     initialClusterPolicy = ICP_RANDOM;
}



/*
 * Releases the per-cluster working arrays.  They are obtained with
 * malloc()/realloc() in _allocateDataStructures(), so they are returned
 * with free().  free(NULL) is a no-op, which covers an object whose
 * arrays were never allocated.
 *
 * NOTE(review): assumes nothing else takes ownership of these arrays and
 * that objects of this class are not copied -- confirm against the header.
 */
SquareErrorCluster::~SquareErrorCluster()
{
     free( clusterCentroids );
     free( clusterCounts );
     free( squareErrors );

     clusterCentroids = NULL;
     clusterCounts = NULL;
     squareErrors = NULL;
}


void
SquareErrorCluster::_allocateDataStructures()
{
     // realloc() on a NULL pointer behaves as malloc(), so one call per
     // array covers both the first allocation and any later resizing.
     // The contents of the arrays are not initialized here.

     // One coordinate vector (dimension doubles) per cluster
     clusterCentroids = (double *)realloc( clusterCentroids,
                 numberOfIntendedClusters * dimension * sizeof(double) );

     // Number of data points assigned to each cluster
     clusterCounts = (int *)realloc( clusterCounts,
                                     numberOfIntendedClusters * sizeof(int) );

     // Square-error of each cluster
     squareErrors = (double *)realloc( squareErrors,
                                numberOfIntendedClusters * sizeof(double) );
}


/*
 * Recomputes the centroid and the membership count of one cluster.
 *
 * whichCluster - index of the cluster to update
 *
 * On return clusterCounts[whichCluster] holds the number of data points
 * whose clustering[] entry equals whichCluster, and the cluster's row of
 * clusterCentroids holds the coordinate-wise mean of those points.  A
 * cluster without members keeps an all-zero centroid instead of the NaN
 * values that a division by a zero count would produce.
 */
void
SquareErrorCluster::_computeClusterCentroid( int whichCluster )
{
     const int centroidBase = whichCluster*dimension;
     int memberCount = 0;
     int i, j;

     // Start from the origin
     for (j=0; j<dimension; j++) {
         clusterCentroids[centroidBase + j] = 0.0;
     }

     // Accumulate the coordinates of every member of this cluster
     for (i=0; i<numberOfDataPoints; i++) {
         if ( clustering[i] != whichCluster ) {
            continue;
         }
         for (j=0; j<dimension; j++) {
             clusterCentroids[centroidBase + j] += data[i*dimension + j];
         }
         memberCount++;
     }
     clusterCounts[whichCluster] = memberCount;

     // An empty cluster has no mean; leave its centroid at the origin
     if ( memberCount == 0 ) {
        return;
     }

     // Turn the coordinate sums into means
     for (j=0; j<dimension; j++) {
         clusterCentroids[centroidBase + j] /= memberCount;
     }
}


// Returns the Euclidean distance between data point `point` and the
// centroid currently stored for cluster `centroid`.
double
SquareErrorCluster::_computeDistanceToCentroid( int point, int centroid )
{
     const int pointBase = point*dimension;
     const int centroidBase = centroid*dimension;

     double sumOfSquares = 0.0;
     for (int axis=0; axis<dimension; axis++ ) {
         const double delta = (double)(data[pointBase+axis] -
                                       clusterCentroids[centroidBase+axis]);
         sumOfSquares += delta * delta;
     }
     return( sqrt(sumOfSquares) );
}


double
SquareErrorCluster::_computeSquareError( int whichCluster )
{
     /* The square-error of a cluster is the sum, over its member points,
        of the squared Euclidean distance to the cluster centroid (the
        within-cluster variation).  The square-error of a clustering is
        the sum of the per-cluster values; minimizing within-cluster
        variation is equivalent to maximizing between-cluster variation
        (Jain and Dubes). */

     // The centroid must be current before distances are measured
     _computeClusterCentroid(whichCluster);

     const int centroidBase = whichCluster*dimension;
     double clusterError = 0.0;

     for (int point=0; point<numberOfDataPoints; point++) {
         if ( clustering[point] != whichCluster ) {
            continue;
         }

         // Squared distance from this member to the centroid
         double squaredDistance = 0.0;
         for (int axis=0; axis<dimension; axis++) {
             const double delta = data[point*dimension + axis] -
                                  clusterCentroids[centroidBase + axis];
             squaredDistance = squaredDistance + delta * delta;
         }
         clusterError += squaredDistance;
     }

     // Record the result for the cluster and hand it back
     squareErrors[whichCluster] = clusterError;
     return( clusterError );
}


                 
// Recomputes the square-error of every cluster and returns their sum.
double
SquareErrorCluster::_computeSquareError()
{
     double total = 0.0;
     int cluster = 0;

     while ( cluster < numberOfIntendedClusters ) {
         total += _computeSquareError( cluster );
         cluster++;
     }
     return( total );
}
     


void
SquareErrorCluster::_initialClusterRandomPolicy()
{
     int i;
     Boolean_ allClustersPopulated = FALSE_;
     double ourRandomNumber;

     while ( ! allClustersPopulated ) {        
        for (i=0; i<numberOfDataPoints; i++) {
            ourRandomNumber = (double)rand() / (double)INT_MAX;
	    clustering[i] = (int)(ourRandomNumber * numberOfIntendedClusters);
	    clusterCounts[ clustering[i] ]++;
        }

	// Check to insure that the random clustering covers all clusters
	allClustersPopulated = TRUE_;
	for (i=0; i<numberOfIntendedClusters; i++) {
            if ( clusterCounts[i] == 0 ) {
               allClustersPopulated = FALSE_;
	    }
	}
     }
}


// Runs one migration pass using the configured migration policy and
// returns that policy's result.  An unrecognized policy moves nothing
// and yields FALSE_.
Boolean_
SquareErrorCluster::_migratePoints()
{
     if ( migrationPolicy == MP_CASE_CLOSEST ) {
        return( _migrationCaseClosestPolicy() );
     }
     if ( migrationPolicy == MP_CASE_MAXREDUCE ) {
        return( _migrationMaxReductionPolicy() );
     }
     return( FALSE_ );
}


/*
 * Moves each data point to the cluster whose centroid is closest to it.
 * Centroids are not recomputed during the pass; clusterCounts is kept up
 * to date as points move.  A point is left where it is when it is the
 * only remaining member of its cluster, so no cluster is emptied.
 *
 * Returns TRUE_ if at least one point changed cluster, FALSE_ otherwise.
 */
Boolean_
SquareErrorCluster::_migrationCaseClosestPolicy()
{
     Boolean_ changeMade = FALSE_;
     int i, j;

     for (i=0; i<numberOfDataPoints; i++) {
         // Find the nearest centroid; ties keep the lowest cluster index
         int bestClusterForPoint = 0;
         double minimumPointDistance = _computeDistanceToCentroid( i, 0 );

         for (j=1; j<numberOfIntendedClusters; j++ ) {
             const double currentPointDistance =
                                        _computeDistanceToCentroid( i, j );
             if ( currentPointDistance < minimumPointDistance ) {
                minimumPointDistance = currentPointDistance;
                bestClusterForPoint = j;
             }
         }

         // Only an actual move counts as a change
         const int currentCluster = clustering[i];
         if ( ( bestClusterForPoint != currentCluster ) &&
              ( clusterCounts[currentCluster] > 1 ) ) {
            clusterCounts[currentCluster]--;
            clustering[i] = bestClusterForPoint;
            clusterCounts[bestClusterForPoint]++;
            changeMade = TRUE_;
         }
     }
     return( changeMade );
}


/*
 * Finds the single point reassignment that most reduces the total
 * square-error and applies it.
 *
 * Every (point, other cluster) pair is tried by temporarily reassigning
 * the point and recomputing the square-error of the two clusters
 * involved.  Points that are the only member of their cluster are never
 * moved.  The per-cluster square-errors in squareErrors at entry are taken
 * as the baseline.
 *
 * The trial computations overwrite the centroids, counts and square-errors
 * of the clusters involved.  The counts are restored after every trial
 * because this routine depends on them; the centroids and square-errors
 * are left for the caller to refresh (cluster() calls _computeSquareError()
 * after every migration pass).
 *
 * Returns TRUE_ if a point was reassigned, FALSE_ if no reassignment
 * reduces the square-error.
 */
Boolean_
SquareErrorCluster::_migrationMaxReductionPolicy()
{
     // Store the initial square errors
     double *originalSquareError = new double[numberOfIntendedClusters];
     int i;
     for (i=0; i<numberOfIntendedClusters; i++) {
         originalSquareError[i] = squareErrors[i];
     }

     // Determine which single point reassignment will most decrease
     // the overall square error
     double maxReduction = 0.0;
     int bestPoint = NONEXISTENT_POINT;
     int bestNewCluster = -1;
     int newCluster;

     for (i=0; i<numberOfDataPoints; i++) {
         const int oldCluster = clustering[i];
         const int oldClusterCount = clusterCounts[oldCluster];

         // Never take the last member out of a cluster
         if ( oldClusterCount <= 1 ) {
            continue;
         }

         for (newCluster=0; newCluster<numberOfIntendedClusters; newCluster++){
             if ( newCluster == oldCluster ) {
                continue;
             }
             const int newClusterCount = clusterCounts[newCluster];

             // Trial reassignment
             clustering[i] = newCluster;
             const double oldClusterNewError = _computeSquareError(oldCluster);
             const double newClusterNewError = _computeSquareError(newCluster);
             clustering[i] = oldCluster;

             // The trial changed both membership counts; put them back so
             // later trials test against the real cluster sizes.
             clusterCounts[oldCluster] = oldClusterCount;
             clusterCounts[newCluster] = newClusterCount;

             const double changeInError =
                            ( originalSquareError[oldCluster] +
                              originalSquareError[newCluster] ) -
                            ( oldClusterNewError + newClusterNewError );
             if ( changeInError > maxReduction ) {
                maxReduction = changeInError;
                bestPoint = i;
                bestNewCluster = newCluster;
             }
         }
     }

     // Allocated with new[], so it must be released with delete[]
     delete [] originalSquareError;

     // Reassign the single point, if one existed
     if ( bestPoint == NONEXISTENT_POINT ) {
        return( FALSE_ );
     }
     clusterCounts[ clustering[bestPoint] ]--;
     clustering[bestPoint] = bestNewCluster;
     clusterCounts[bestNewCluster]++;
     return( TRUE_ );
}
				    

// Runs the square-error clustering: allocates the working arrays, builds
// the initial clustering and then repeats migration passes.  Iteration
// stops when MAX_ITERATIONS passes have run, when a pass fails to lower
// the total square-error, or when a pass reports that nothing moved.
void
SquareErrorCluster::cluster()
{
     _allocateDataStructures();

     // Initial clustering; only the random policy is handled here
     if ( initialClusterPolicy == ICP_RANDOM ) {
        _initialClusterRandomPolicy();
     }
cout << "Clustering " << numberOfDataPoints << " in " << dimension << " dimensions\n";

     // Seed the "previous" error above the initial one so that the first
     // pass is always attempted
     double squareError = _computeSquareError();
     double previousSquareError = squareError + 1.;
     Boolean_ progressMade = TRUE_;
     int numberOfIterations;

     for ( numberOfIterations = 0;
           (numberOfIterations < MAX_ITERATIONS) &&
           (squareError < previousSquareError ) && progressMade;
           numberOfIterations++ ) {
          previousSquareError = squareError;
          progressMade = _migratePoints();
          squareError = _computeSquareError();
     }
cout << "Number of iterations executed " << numberOfIterations << " \n";

     numberOfFoundClusters = numberOfIntendedClusters;
}


/*
 * Chooses a representative data point for every found cluster.
 *
 * repList - output array with room for numberOfFoundClusters entries;
 *           repList[c] receives the index of the data point that lies
 *           closest to the centroid of cluster c.  All data points are
 *           candidates, not only the members of cluster c.  Ties keep the
 *           lowest point index.
 */
void
SquareErrorCluster::getClusterReps( int *repList )
{
     double *repDistances = new double[ numberOfFoundClusters ];
     int i, j;

     // Point 0 is the initial candidate for every cluster
     for (j=0; j < numberOfFoundClusters; j++) {
         repDistances[j] = _computeDistanceToCentroid( 0, j );
         repList[j] = 0;
     }

     // Compare the remaining points against every cluster, including
     // cluster 0
     for (i=1; i < numberOfDataPoints; i++) {
         for (j=0; j < numberOfFoundClusters; j++) {
             const double currentRepDistance =
                                        _computeDistanceToCentroid( i, j );
             if ( currentRepDistance < repDistances[j] ) {
                repDistances[j] = currentRepDistance;
                repList[j] = i;
             }
         }
     }

     // Allocated with new[], so it must be released with delete[]
     delete [] repDistances;
}
     


