/*
 * This file is part of the Pablo Performance Analysis Environment
 *
 *                                           TM
 * The Pablo Performance Analysis Environment   software is *not* in
 * the public domain.  However, it is freely available without fee for
 * education, research, and non-profit purposes.  By obtaining copies
 * of this and other files that comprise the Pablo Performance Analysis
 * Environment, you, the Licensee, agree to abide by the following
 * conditions and understandings with respect to the copyrighted software:
 * 
 * 1.  The software is copyrighted in the name of the Board of Trustees
 *     of the University of Illinois (UI), and ownership of the software
 *     remains with the UI. 
 *
 * 2.  Permission to use, copy, and modify this software and its documentation
 *     for education, research, and non-profit purposes is hereby granted
 *     to Licensee, provided that the copyright notice, the original author's
 *     names and unit identification, and this permission notice appear on
 *     all such copies, and that no charge be made for such copies.  Any
 *     entity desiring permission to incorporate this software into commercial
 *     products should contact:
 *
 *          Professor Daniel A. Reed                 reed@cs.uiuc.edu
 *          University of Illinois
 *          Department of Computer Science
 *          2413 Digital Computer Laboratory
 *          1304 West Springfield Avenue
 *          Urbana, Illinois  61801
 *          USA
 *
 * 3.  Licensee may not use the name, logo, or any other symbol of the UI
 *     nor the names of any of its employees nor any adaptation thereof in
 *     advertizing or publicity pertaining to the software without specific
 *     prior written approval of the UI.
 *
 * 4.  THE UI MAKES NO REPRESENTATIONS ABOUT THE SUITABILITY OF THE
 *     SOFTWARE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS
 *     OR IMPLIED WARRANTY.
 *
 * 5.  The UI shall not be liable for any damages suffered by Licensee from
 *     the use of this software.
 *
 * 6.  The software was developed under agreements between the UI and the
 *     Federal Government which entitle the Government to certain rights.
 *
 **************************************************************************
 *
 * Developed by: The TAPESTRY Parallel Computing Laboratory
 *		 University of Illinois at Urbana-Champaign
 *		 Department of Computer Science
 *		 1304 W. Springfield Avenue
 *		 Urbana, IL	61801
 *
 * Copyright (c) 1987-1994
 * The University of Illinois Board of Trustees.
 *	All Rights Reserved.
 *
 * Author: Bradley W. Schwartz (schwartz@cs.uiuc.edu)
 * Project Manager and Principal Investigator:
 *	Daniel A. Reed (reed@cs.uiuc.edu)
 *
 * Funded by: National Science Foundation grants NSF CCR86-57696,
 * NSF CCR87-06653 and NSF CDA87-22836 (Tapestry), NASA ICLASS Contract
 * No. NAG-1-613, DARPA Contract No. DABT63-91-K-0004, by a grant
 * from the Digital Equipment Corporation External Research Program,
 * and by a collaborative research agreement with the Intel Supercomputer
 * Systems Division.
 *
 *
*/

#include "SquareErrorClusterVariable.h"
#include <stdlib.h>
#include <math.h>


// Returns x * x.
// Implemented as an inline function template instead of a function-like
// macro so the argument expression is evaluated exactly once (a macro
// would expand Square(i++) into two increments).  Works for the int and
// floating-point arguments used in this file.
template <typename T>
static inline T Square( T x )
{
     return x * x;
}



// Construct a cluster variable that owns no dynamic storage yet.  Each
// buffer pointer starts out NULL so that later code can tell whether
// there is anything to free.
SquareErrorClusterVariable::SquareErrorClusterVariable()
{
     // Count array handed out by getClusterCnts().
     cntlist = NULL;

     // Inter-centroid distance table.
     centroidDistances = NULL;

     // Head of the linked list of cluster centroids.
     clusterCentroidList = NULL;
}


// Release everything this object allocated.
// _deallocateDataStructures() frees the centroid list and the
// inter-centroid distance table and tolerates both being NULL already;
// cntlist is the array last returned by getClusterCnts().
// NOTE(review): assumes objects of this class are not copied by value;
// a shallow copy would lead to the same buffers being freed twice.
SquareErrorClusterVariable::~SquareErrorClusterVariable()
{
     _deallocateDataStructures();

     if ( cntlist != NULL ) {
        free( cntlist );
        cntlist = NULL;
     }
}


void
SquareErrorClusterVariable::_addNewCluster( int point )
{
     clustering[point] = currentNumberOfClusters++;
// fprintf(stderr, "CD before %d\n", (int)centroidDistances );
// fprintf(stderr, "Current NC %d Square %d\n", currentNumberOfClusters,
//           Square(currentNumberOfClusters) );

     if (centroidDistances != NULL) {  
        free(centroidDistances);
     }
     centroidDistances = (double *)malloc( 
	                   Square(currentNumberOfClusters) * sizeof(double));
//     centroidDistances =(double *)realloc( centroidDistances,
//                          Square(currentNumberOfClusters) * sizeof(double)); 
// fprintf(stderr, "CD after %d\n", 	(int)centroidDistances );

     ClusterCentroid *lastCentroid = clusterCentroidList;
// fprintf(stderr, "Head is %d\n", (int)lastCentroid);  

     if ( lastCentroid != NULL ) {
        while (lastCentroid->next != NULL ) {
             lastCentroid = (ClusterCentroid *)lastCentroid->next;
        }
     }
// fprintf(stderr, "Last centroid is %d\n", (int)lastCentroid);  

     ClusterCentroid *newCentroid = (ClusterCentroid *)malloc( 
                                               sizeof(ClusterCentroid));
// fprintf(stderr, "New centroid is %d\n", (int)newCentroid );

     newCentroid->centroid = (double *)malloc(dimension *sizeof(double));

     int i;
     for (i=0; i<dimension; i++) {
         newCentroid->centroid[i] = data[point*dimension + i];
     }

     if ( clusterCentroidList == NULL ) {
        clusterCentroidList = newCentroid;
	clusterCentroidList->prev = NULL;
     } else {
        lastCentroid->next = newCentroid;
        newCentroid->prev = lastCentroid;
     }
     newCentroid->count = 1;
     newCentroid->next = NULL;
}

         
// Give the inter-centroid distance table an initial one-element
// allocation.  _addNewCluster() frees and re-allocates the table every
// time a cluster is created.
void
SquareErrorClusterVariable::_allocateDataStructures()
{
     double *initialTable = (double *)malloc( sizeof(double) );
     centroidDistances = initialTable;
}



// Euclidean distance between two cluster centroids.
//   centroid1, centroid2 - positions of the clusters in the centroid list.
// Returns the square root of the summed squared coordinate differences
// over all `dimension` axes.  The positions are not range-checked here;
// _findCentroid() does no bounds checking either.
double
SquareErrorClusterVariable::_computeCentroidSpacing( int centroid1,
                                                     int centroid2 )
{
     const double *coordsA = _findCentroid( centroid1 )->centroid;
     const double *coordsB = _findCentroid( centroid2 )->centroid;

     double sumOfSquares = 0.0;
     int axis;
     for ( axis = 0; axis < dimension; axis++ ) {
         double delta = coordsA[axis] - coordsB[axis];
         sumOfSquares += delta * delta;
     }
     return sqrt( sumOfSquares );
}


// Refresh the inter-centroid distance table.
// The table is treated as a currentNumberOfClusters x
// currentNumberOfClusters row-major matrix; only the entries above the
// diagonal (row < col) are written.
void
SquareErrorClusterVariable::_computeCentroidSpacings()
{
     int row, col;
     for ( row = 0; row < currentNumberOfClusters; row++ ) {
         for ( col = row + 1; col < currentNumberOfClusters; col++ ) {
             int slot = row*currentNumberOfClusters + col;
             centroidDistances[slot] = _computeCentroidSpacing( row, col );
         }
     }
}


double
SquareErrorClusterVariable::_computeDistanceToCentroid( int point,
						        int centroid )
{
     double currentDistance = 0.0;
     int i;
     ClusterCentroid *currentCentroid = _findCentroid( centroid );
/* fprintf(stderr, "Centroid %d found at %d\n", centroid, (int)currentCentroid); */
     for (i=0; i<dimension; i++ ) {
         currentDistance += Square( data[point*dimension+i] -
				    currentCentroid->centroid[i] );
// fprintf(stderr, "D %d:  data: %lf, centroid: %lf\n", i,
//      data[point*dimension+i], currentCentroid->centroid[i] );
     }
     return( sqrt(currentDistance) );
}
    


void
SquareErrorClusterVariable::_deallocateDataStructures()
{
// fprintf(stderr, "Beginning dealloc\n");
     ClusterCentroid *currentListElement = clusterCentroidList;
     if ( clusterCentroidList != NULL ) {
         while ( currentListElement->next != NULL ) {
/* cout << "cle " << (int)currentListElement << " " << (int)(((ClusterCentroid *)currentListElement->next)->prev) << " \n";
cout << "cle2 " << (int)(currentListElement->next) << " " << (int)(currentListElement->prev) << " \n";

cout << "Trying to free with count of " << currentListElement->count << "...\n";
*/
             currentListElement = (ClusterCentroid *)currentListElement->next;
	     free( ((ClusterCentroid *)(currentListElement->prev))->centroid );
	     free( currentListElement->prev );
         }
// fprintf(stderr, "Done w list\n");
         free( currentListElement->centroid );
	 free( currentListElement );
         clusterCentroidList = NULL;
     }
     if ( centroidDistances != NULL ) {
// fprintf(stderr, "Freeing centroid distances\n");
        free( centroidDistances );
        centroidDistances = NULL;
     }
}


// Return the centroid node at list position `centroid` (0 = head).
// No bounds checking is done: the position must be smaller than the
// number of nodes in the list.  A position of zero or less yields the
// head of the list.
ClusterCentroid *
SquareErrorClusterVariable::_findCentroid( int centroid )
{
     ClusterCentroid *node = clusterCentroidList;
     int hopsLeft;
     for ( hopsLeft = centroid; hopsLeft > 0; hopsLeft-- ) {
          node = (ClusterCentroid *)node->next;
     }
     return node;
}


// Find the cluster whose centroid lies nearest to a data point.
//   point - index of the data point.
// Returns the list position of the nearest centroid; when several are
// equally near, the lowest position wins.  At least one cluster must
// exist, because cluster 0 is always examined.
int
SquareErrorClusterVariable::_findCentroidClosestToPoint( int point )
{
     int nearest = 0;
     double nearestDistance = _computeDistanceToCentroid( point, 0 );

     int candidate = 1;
     while ( candidate < currentNumberOfClusters ) {
         double candidateDistance =
              _computeDistanceToCentroid( point, candidate );
         if ( candidateDistance < nearestDistance ) {
            nearestDistance = candidateDistance;
            nearest = candidate;
         }
         candidate++;
     }
     return nearest;
}


void
SquareErrorClusterVariable::_mergeClusters()
{
     Boolean_ clusterMerge = TRUE_;
     int i, j;
     while ( clusterMerge ) {
         clusterMerge = FALSE_;
// fprintf(stderr, "Merge time\n"); 
         for (i=0; i<currentNumberOfClusters; i++) {
            for (j=i+1; j<currentNumberOfClusters; j++) {
// fprintf(stderr, "Checking %d and %d\n", i, j );
	       if ( _computeCentroidSpacing( i, j ) < C ) {
// fprintf(stderr, "Merging clusters %d and %d\n", i, j);
	          _mergeTwoClusters( i, j );
// fprintf(stderr, "Recomputing all cluster centroids\n");
		  _computeCentroidSpacings();
	   	  clusterMerge = TRUE_;
	   	  break;
	       }
	     }
	     if ( clusterMerge ) break;
         }
     }
}


void
SquareErrorClusterVariable::_mergeTwoClusters( int cluster1, int cluster2 )
{
     int i;
     // Reassign the points
     for (i=0; i<numberOfDataPoints; i++) {
         if ( clustering[i] == cluster2 ) {
	    clustering[i] = cluster1;
	 }
     }
     // Recompute the new centroid for the surviving cluster
     ClusterCentroid *centroid1Struct = _findCentroid( cluster1 );
     ClusterCentroid *centroid2Struct = _findCentroid( cluster2 );
     int newCount = centroid1Struct->count + centroid2Struct->count;

     for (i=0; i<dimension; i++) {
         centroid1Struct->centroid[i] =  
		 ((centroid1Struct->centroid[i] * centroid1Struct->count) +
		  (centroid2Struct->centroid[i] * centroid2Struct->count)) /
		 newCount;    
     }
     centroid1Struct->count = newCount;

     // Delete the dead cluster
     ClusterCentroid *doomedCentroid = centroid2Struct;
     ((ClusterCentroid *)(centroid2Struct->prev))->next=centroid2Struct->next;
     if ( centroid2Struct->next != NULL ) {
        ((ClusterCentroid *)(centroid2Struct->next))->prev = 
			                             centroid2Struct->prev;
     }
     free( doomedCentroid->centroid );
     free( doomedCentroid );

     currentNumberOfClusters--;
}




// Standardise the data in place and derive the clustering thresholds.
// For every dimension the mean and the population variance over all data
// points are computed; each value is then replaced by
// (value - mean) / stddev, or by 0.0 when the standard deviation is
// zero.  Finally C is set to 0.5 * dimension and R to 1.0 * dimension.
void
SquareErrorClusterVariable::_normalizeData()
{
     int axis, point;

     // Each dimension is independent of the others, so handle them one
     // at a time.
     for ( axis = 0; axis < dimension; axis++ ) {

         // Mean along this dimension.
         double meanOfAxis = 0.0;
         for ( point = 0; point < numberOfDataPoints; point++ ) {
             meanOfAxis += ( data[point*dimension+axis] / numberOfDataPoints );
         }

         // Variance along this dimension.
         double varianceOfAxis = 0.0;
         for ( point = 0; point < numberOfDataPoints; point++ ) {
             double delta = data[point*dimension+axis] - meanOfAxis;
             varianceOfAxis += ( ( delta * delta ) / numberOfDataPoints );
         }
         double stddevOfAxis = sqrt( varianceOfAxis );

         // Rewrite the column in standardised form.
         for ( point = 0; point < numberOfDataPoints; point++ ) {
             if ( stddevOfAxis == 0 ) {
                data[point*dimension+axis] = 0.0;
             } else {
                data[point*dimension+axis] =
                     (data[point*dimension+axis] - meanOfAxis) / stddevOfAxis;
             }
         }
     }

     // Merge threshold (C) and new-cluster threshold (R).
     C = 0.5 * dimension;
     R = 1.0 * dimension;
}



void
SquareErrorClusterVariable::_updateGainingCentroid( int centroid, int point )
{
     int i;
     ClusterCentroid *centroidStruct = _findCentroid( centroid );
     for (i=0; i<dimension; i++) {
        centroidStruct->centroid[i] = (
                  ((centroidStruct->centroid[i] * centroidStruct->count ) +
		  data[point*dimension+i]) / (centroidStruct->count + 1) );
     }
     centroidStruct->count++;
}



// Run the clustering algorithm over the current data set.
//
//  1. Discard state from any previous run, allocate fresh structures and
//     normalise the data (which also sets the thresholds C and R).
//  2. Seed one cluster from each of the first numberOfIntendedClusters
//     points (clamped to the range 0..numberOfDataPoints) and merge any
//     seeds whose centroids are closer together than C.
//  3. First pass over the remaining points: a point farther than R from
//     its nearest centroid starts a new cluster; otherwise it is folded
//     into that centroid and close clusters are merged again.
//  4. Second pass: with the centroids fixed, assign every point to its
//     nearest centroid.  This overwrites every entry of clustering[].
void
SquareErrorClusterVariable::cluster()
{
     _deallocateDataStructures();
     _allocateDataStructures();
     _normalizeData();

     // Never seed more clusters than there are data points, and never a
     // negative number; seeding reads data[] and writes clustering[] at
     // the seed index.
     int seedCount = numberOfIntendedClusters;
     if ( seedCount > numberOfDataPoints ) {
        seedCount = numberOfDataPoints;
     }
     if ( seedCount < 0 ) {
        seedCount = 0;
     }

     // Assign initial "k" points to clusters
     int i;
     currentNumberOfClusters = 0;
     for (i=0; i<seedCount; i++) {
         _addNewCluster(i);
     }

     // Compute all inter-cluster pairwise distances
     _computeCentroidSpacings();
     _mergeClusters();

     // Assign remaining points to clusters.  The seeds were points
     // 0..seedCount-1, so the first remaining point is seedCount.
     for (i=seedCount; i<numberOfDataPoints; i++) {
         if ( currentNumberOfClusters == 0 ) {
            // No seed exists, so there is no centroid to compare with.
            _addNewCluster( i );
            continue;
         }
         clustering[i] = _findCentroidClosestToPoint(i);
         if ( _computeDistanceToCentroid( i, clustering[i] ) > R ) {
            // Too far from every existing cluster: start a new one.
            _addNewCluster( i );
         } else {
            _updateGainingCentroid( clustering[i], i );
            _computeCentroidSpacings();
            _mergeClusters();
         }
     }

     // Make the second pass and reallocate each point now that the
     // seed points are fixed.
     for (i=0; i<numberOfDataPoints; i++) {
         clustering[i] = _findCentroidClosestToPoint(i);
     }
}


int *
SquareErrorClusterVariable::getClusterCnts()
{
     if ( cntlist != NULL ) {
	free(cntlist);
     }
     cntlist = (int *)malloc( currentNumberOfClusters * sizeof(int));
     int i;  
     for (i=0; i<currentNumberOfClusters; i++) {
	 cntlist[i] = 0;
     }
     for (i=0; i<numberOfDataPoints; i++) {
	 cntlist[ clustering[i] ] ++;
     }

     return( cntlist );
}

// Pick up to `howMany` representative points per cluster, one from each
// of `howMany` concentric rings around the cluster centroid.
//   howMany - number of rings (representatives) per cluster.
// The radius of a cluster is the distance from its centroid to its
// farthest member; ring j covers distances from j*radius/howMany up to
// (j+1)*radius/howMany.  Within each ring the member closest to the
// centroid is chosen.
// Returns an array of currentNumberOfClusters * howMany point indices,
// laid out as [cluster*howMany + ring]; an entry is -1 when no member of
// the cluster falls in that ring.  Returns NULL when howMany <= 0.
// The array lives in a function-local static shared by all objects of
// this class and is freed on the next call.
int *
SquareErrorClusterVariable::getClusterReps( int howMany )
{
     static int *replist = NULL;
     if ( replist != NULL ) {
         free ( replist );
         replist = NULL;
     }
     // With no rings there is nothing to index.
     if ( howMany <= 0 ) {
        return( NULL );
     }
     replist = (int *)malloc( currentNumberOfClusters *howMany*sizeof(int) );

     int i, j;
     double currentDistance;

     // First, find the max distances: the radius of each cluster,
     // measured over that cluster's own members only.
     double *maxDistances = (double *)malloc( currentNumberOfClusters *
                                              sizeof(double) );
     for (i=0; i<currentNumberOfClusters; i++) {
         maxDistances[i] = 0.0;
     }
     for (i=0; i<numberOfDataPoints; i++) {
         currentDistance = _computeDistanceToCentroid( i, clustering[i] );
         if ( currentDistance > maxDistances[ clustering[i] ] ) {
            maxDistances[ clustering[i] ] = currentDistance;
         }
     }

     // Then, partition the space: ring j starts at j/howMany of the
     // radius, so ring 0 always starts at distance 0.
     double *threshDistances = (double*)malloc(currentNumberOfClusters*
                                               howMany* sizeof(double) );
     Boolean_ *repFound = (Boolean_ *)malloc( currentNumberOfClusters*
                                             howMany * sizeof(Boolean_) );
     for (i=0; i<currentNumberOfClusters; i++) {
         for (j=0; j<howMany; j++) {
             threshDistances[i*howMany+j] = (j*maxDistances[i]) / howMany;
             replist[i*howMany+j] = -1;
             repFound[i*howMany+j] = FALSE_;
         }
     }
     free( maxDistances );

     // Place every point in its ring and keep the closest one per ring.
     int cluster, ring, index;
     double *repDistances = (double *)malloc( currentNumberOfClusters *
                                              howMany * sizeof(double) );

     for (i=0; i<numberOfDataPoints; i++) {
         cluster = clustering[i];
         currentDistance = _computeDistanceToCentroid( i, cluster );

         // Start at the outermost ring and move inwards until the point
         // is at or beyond the ring's inner boundary; never go below
         // ring 0 of this cluster.
         ring = howMany-1;
         index = cluster*howMany+ring;
         while ( ring > 0 && currentDistance < threshDistances[index] ) {
            ring--;
            index--;
         }

         if ((!repFound[index]) || (currentDistance<repDistances[index])){
            repDistances[index] = currentDistance;
            replist[index] = i;
            repFound[index] = TRUE_;
         }
     }
     free( repDistances );
     free( repFound );
     free( threshDistances );

     return(replist);
}


	
	
// Pick one representative point per cluster: the data point, taken over
// ALL data points, that lies closest to the cluster's centroid.  Ties go
// to the lowest point index.
// Returns an array of currentNumberOfClusters point indices.  The array
// lives in a function-local static and is freed on the next call.
int *
SquareErrorClusterVariable::getClusterReps()
{
     static int *replist = NULL;

     // Release the array handed out by the previous call, if any.
     if ( replist != NULL ) {
         free ( (void *)replist );
     }
     replist = (int *)malloc( currentNumberOfClusters * sizeof(int) );

     // Best (smallest) distance seen so far for each cluster.
     double *nearestSoFar = (double *)malloc( currentNumberOfClusters *
                                              sizeof(double) );

     int cluster, point;
     for ( cluster = 0; cluster < currentNumberOfClusters; cluster++ ) {
         // Point 0 is the initial candidate for every cluster.
         nearestSoFar[cluster] = _computeDistanceToCentroid( 0, cluster );
         replist[cluster] = 0;

         for ( point = 1; point < numberOfDataPoints; point++ ) {
             double candidateDistance =
                  _computeDistanceToCentroid( point, cluster );
             if ( candidateDistance < nearestSoFar[cluster] ) {
                nearestSoFar[cluster] = candidateDistance;
                replist[cluster] = point;
             }
         }
     }

     free( nearestSoFar );
     return replist;
}
	

// Report the number of clusters, capped at the number of data points.
int
SquareErrorClusterVariable::getNumberOfClusters()
{
     return ( currentNumberOfClusters > numberOfDataPoints )
                 ? numberOfDataPoints
                 : currentNumberOfClusters;
}

