# a nawk script
# we expect the following columns:
# systemname iop sort iop*sort r r-trials
#   where systemname is a string
#   iop sort   are levels (-1, 1)
#   y is a floating-point number (mean of the trials); it is computed
#     by this script from the r trial values, not read as an input column
#   r is the number of trials to follow
#
# note that the variable t must be changed by hand for each value of k,r


# Part of
#              The STARFISH Parallel file-system simulator
#        (Simulation Tool for Advanced Research in File Systems)
# 
#                               David Kotz
#                           Dartmouth College
#                              Version 3.0
#                              January 1996
#                          dfk@cs.dartmouth.edu
#



function init()
{
    # Reset the per-system accumulators so the next system starts clean.
    SSY = 0;                    # running sum of squared observations
    rows = 0;                   # number of input rows seen for this system

    # Zero the running dot product of every output column.
    i = 0;
    while (i < cols) {
        dotprod[i] = 0;
        i++;
    }
}

# Print the column-heading line of the report: the system-name column,
# "I" for the mean effect, one numbered column per direct factor, one
# column per pair of factors, and a final "Error" column.
function header()
{
    # system name, then the mean effect
    printf("%-25s%10s", "system name", "I");

    # direct effects: factor numbers 1..k
    i = 1;
    while (i <= k) {
        printf("%10d", i);
        i++;
    }

    # 2-factor interactions: both factor numbers, side by side
    for (i = 1; i <= k; i++) {
        j = i + 1;
        while (j <= k) {
            printf("%9d%d", i, j);
            j++;
        }
    }

    # error column, then end the line
    printf("%10s\n", "Error");
}

# Compute the effects for the previous pattern.
#
#   reads:   dotprod[0..cols-1], rows, cols
#   writes:  effect[0..cols-1]
#
# Each effect is the column's accumulated dot product divided by the
# number of rows accumulated.  If no rows were accumulated the effects
# are set to 0 instead of dividing by zero (which is fatal in awk).
#
# The extra parameter i is awk's idiom for a local variable; callers
# pass no arguments.
function find_effects(    i)
{
    for (i = 0; i < cols; i++)
        effect[i] = (rows > 0) ? dotprod[i] / rows : 0;
}

# Assign the fractional variations for the previous pattern.
#
#   reads:   effect[0..cols-1], SSY, k, r, cols
#   writes:  SS[0..cols-1], SSE, SST, variation[1..cols-1], variationE
#
# SS[i] is the sum of squares attributed to column i (SS0, SSA, SSB,
# SSAB, ...), SSE is what remains of SSY after all of them are removed,
# and SST = SSY - SS[0] is the total variation.  The percentages are
# each SS[i] (and SSE) relative to SST.
#
# When SST is 0 (every observation identical) there is no variation to
# apportion, so the percentages are reported as 0 rather than dividing
# by zero.
#
# The extra parameters are local variables; callers pass no arguments.
function find_variations(    i, scale)
{
    scale = 2^k * r;            # loop-invariant factor of every SS term

    # find SS0, SSA, SSB, SSAB, etc, along with SSE
    SSE = SSY;
    for (i = 0; i < cols; i++) {
        SS[i] = scale * effect[i] * effect[i];
        SSE -= SS[i];
    }

    # find SST
    SST = SSY - SS[0];

    # find the percent variations (not for [0], the mean)
    for (i = 1; i < cols; i++)
        variation[i] = (SST != 0) ? SS[i] / SST * 100 : 0;
    variationE = (SST != 0) ? SSE / SST * 100 : 0;
}

# Compute the standard deviation of the effects and their confidence
# intervals for the previous pattern.
#
#   reads:   SSE, effect[0..cols-1], k, r, cols, t (optional preset)
#   writes:  MSE, sd, t, conflo[0..cols-1], confhi[0..cols-1]
#
# t is the t-distribution quantile t[0.95; 2^k * (r-1)], i.e. 90%
# confidence.  If t has not been set (for example with "-v t=...") it
# defaults to 1.746, which is the value for k=2, r=5; any other k or r
# needs a different t.
#
# With r <= 1 there are no degrees of freedom for the error estimate;
# a warning is printed and sd is set to 0 instead of dividing by zero.
# A slightly negative SSE (floating-point rounding when the true error
# is zero) is clamped so sqrt() is never given a negative argument.
#
# The extra parameters are local variables; callers pass no arguments.
function find_confidence(    i, dof)
{
    dof = 2^k * (r - 1);        # degrees of freedom of the error term

    if (dof > 0) {
        # Mean Squared of Error (MSE)
        MSE = SSE / dof;
        if (MSE < 0)
            MSE = 0;
        sd = sqrt(MSE / (2^k * r));     # same for all cols
    } else {
        print "Warning: r =", r, "leaves no degrees of freedom; confidence intervals are degenerate";
        MSE = 0;
        sd = 0;
    }

    # keep a caller-supplied t, otherwise use the k=2, r=5 value
    if (t == "")
        t = 1.746;

    # find the confidence intervals
    for (i = 0; i < cols; i++) {
        conflo[i] = effect[i] - t * sd;
        confhi[i] = effect[i] + t * sd;
    }
}

# Print the summary for the previous pattern: one row each for the
# effects, the percent variation, the upper and lower confidence
# bounds, and a yes/no significance flag, followed by a blank line.
#   sys - label printed in the first column of the effects row
function doprint(sys)
{
    # effects row, labelled with the system name
    printf("%-25s", sys);
    i = 0;
    while (i < cols)
        printf("%10.2f", effect[i++]);
    printf("\n");

    # (debugging aid, disabled) raw sums of squares:
    #    printf "%-25s", "  SS";
    #    for (i = 0; i < cols; i++)
    #      printf " %10e", SS[i];
    #    printf " %10e %10e\n", SSE, SST;

    # percent variation: the mean column is left blank, error goes last
    printf("%-25s%10s", "  %variation", "");
    i = 1;
    while (i < cols)
        printf("%10.2f", variation[i++]);
    printf("%10.2f\n", variationE);

    # upper confidence bounds
    printf("%-25s", "  conf.int.hi");
    i = 0;
    while (i < cols)
        printf("%10.2f", confhi[i++]);
    printf("\n");

    # lower confidence bounds
    printf("%-25s", "  conf.int.lo");
    i = 0;
    while (i < cols)
        printf("%10.2f", conflo[i++]);
    printf("\n");

    # an effect is flagged "yes" when both bounds have the same sign,
    # i.e. the interval does not include zero
    printf("%-25s", "  significant?");
    for (i = 0; i < cols; i++) {
        if (conflo[i] * confhi[i] > 0)
            printf("%10s", "yes");
        else
            printf("%10s", "no");
    }

    # end the row and leave a blank line after the summary
    printf("\n\n");
}

################################################
BEGIN {
    # number of direct factors
    k = 2;

    # output columns: mean, k 1-factor effects, k*(k-1)/2 2-factor effects
    cols = 1 + k + k*(k-1)/2;

    # input layout: systemname, k factors, k*(k-1)/2 2-factor columns,
    # then r, then the r trials -- so r sits one field past the effects
    rfield = cols + 1;

    # no input line has been processed yet
    first = 1;

    header();
    init();
}

################################################
# For every well-formed input line.
# Field rfield holds r, the number of trials, and the trials themselves
# are in fields rfield+1 .. rfield+r.  A line is accepted only when r is
# a positive number and the field count matches it exactly.  Requiring
# r > 0 keeps lines with exactly rfield fields and a zero or non-numeric
# r field from reaching the division by r below; they fall through to
# the bad-input rule instead.
$(rfield) + 0 > 0 && NF == rfield + $(rfield) {
    # A change of system name closes the previous pattern.
    if ($1 != lastsys && !first) {
        find_effects();         # compute effects for previous pattern
        find_variations();      # assign variations for previous pattern
        find_confidence();      # find conf interval for previous pattern
        doprint(lastsys);       # print summary for previous pattern
        init();                 # reset for this new pattern
    }

    if (!first && $rfield != r) {
        print "Warning: r is different (",$rfield,") than before (",r,")";
    }

    first = 0;
    rows++;
    lastsys = $1;

    r = $rfield;                # number of trials

    # One pass over the trials: sum them for the average, and keep
    # track of the sum of squares of the individual y values.
    y = 0;
    for (i = 1; i <= r; i++) {
        y += $(rfield+i);       # take a log here if multiplicative model
        SSY += $(rfield+i) * $(rfield+i);
    }
    y /= r;                     # average (r > 0 is guaranteed by the pattern)

    # effect of the mean
    col = 0;
    dotprod[col++] += y;

    # direct effects: factor i's level is in field i+1
    for (i = 1; i <= k; i++) {
        field = i+1;
        dotprod[col++] += $field * y;
    }

    # and the 2-factor interactions; the sign is recomputed as the
    # product of the two factor levels rather than read from the
    # interaction column of the input
    for (i = 1; i <= k; i++) {
        ifield = i+1;
        for (j = i+1; j <= k; j++) {
            jfield = j+1;
            dotprod[col++] += $ifield * $jfield * y;
        }
    }

    next;                       # skip rule below
}

# Fallback for every input line the rule above did not consume (that
# rule ends with "next"): report the line and how many fields it had.
{
    printf("Bad input line '%s' (%d fields)\n", $0, NF);
}

END {
    # Flush the summary for the last pattern.  When no valid input line
    # was accumulated, rows is still 0 and there is nothing to report;
    # skipping the calls also avoids dividing by rows == 0 in
    # find_effects().
    if (rows > 0) {
        find_effects();         # compute effects for previous pattern
        find_variations();      # assign variations for previous pattern
        find_confidence();      # find conf interval for previous pattern
        doprint(lastsys);       # summary for last pattern
    }
}

