#!/usr/bin/perl


# This program is compatible both with 32-bit Perl 4.036, and Perl 5.
#
# Copyright 2010, by David A. Burton
# Cary, NC  USA
# +1 919-481-0149
# Email: http://www.burtonsys.com/email/


# immediate output of debug prints
$| = 1;


# ---- these globals are for command-line options ----
$debugmode=0;  # >0 for debug prints
$givehelp=0;  # 1 for '-?' or '--help'
$method=0;  # 1 for ANOVA method, 2 for synthetic approx, 4 for simple average of monthly SDs,
  $method_anova_1c = 1;  # '-a' (default)
  $method_synth_2c = 2;  # '-s'
  $method_mean_4c = 4;   # '-m'
$samples_per_group = 30;  # '-n=30' is the default (assume 30 days per month)

while (($#ARGV >= 0) && ('-' eq substr($ARGV[0],0,1))) {
   if ('-d' eq $ARGV[0]) {
      $debugmode++;  # turn on debug prints
   } elsif ('-a' eq $ARGV[0]) {
      $method |= $method_anova_1c;  # use ANOVA calculation (best technique)
   } elsif ('-s' eq $ARGV[0]) {
      $method |= $method_synth_2c;  # use synthetic approximation technique (not an exact technique)
   } elsif ('-m' eq $ARGV[0]) {
      $method |= $method_mean_4c;  # use simple mean (average) of SDs (not a valid technique)
   } elsif ($ARGV[0] =~ /^\-n\=([0-9\.]+)$/) {
      $tmp = $1 + 0;
      if ($tmp >= 2) {
         $samples_per_group = $tmp;
         print "$tmp samples per group\n";
      }
   } elsif (('-?' eq $ARGV[0]) || ('-h' eq $ARGV[0]) ||('--help' eq $ARGV[0])) {
      $givehelp++;
   } else {
      printf "ERROR: unrecognized command-line option: '%s'\n", $ARGV[0];
      exit 1;
   }
   shift @ARGV;
}

if (!$method) {
   $method = $method_anova_1c;  # default is ANOVA method
}

if ($debugmode) {
  if ($method_anova_1c & $method) {
     print "SD calculation method = ANOVA (2), with $samples_per_group samples / group\n";
  }
  if ($method_synth_2c & $method) {
     print "SD calculation method = Synthetic approximation (1), with $samples_per_group samples / group\n";
  }
  if ($method_mean_4c & $method) {
     print "SD calculation method = simple average (0), (not a valid method!)\n";
  }
}

$num_args = $#ARGV+1;

if (($num_args != 1) || $givehelp) {
 print "composite_sd_calc.pl -- Calculate composite yearly standard deviations (and\n" .
       "  means) from church_white_grl_gmsl.lis, to match the format of the ones in\n" .
       "  church_white_new_gmsl.lis (which contains annual data instead of monthly).\n" .
       "\n" .
       "The calculation is done via an analysis of variance (ANOVA) calculation, and/or\n" .
       "optionally (and less precisely and rapidly) by simulating the original raw\n" .
       "data with a synthetic approximation.\n" .
       "\n" .
       "Usage:\n" .
       "\n" .
       "   perl -w $0 [options] church_white_grl_gmsl.lis\n" .
       "or:\n" .
       "   perl -w $0 [options] church_white_grl_gmsl.lis >annualized_gmsl.lis\n" .
       "\n" .
       "where [options] can be a combination (or none) of the following:\n" .
       "  -a  Use Analysis Of Variance exact calculation (default)\n" .
       "  -s  Use Synthetic distribution approximation calculation (valid, but inexact)\n" .
       "  -m  Use simple mean (average) of monthly SDs (this is NOT a valid technique!)\n" .
       "  -n=xxx  Assume xxx samples per group (default is 30)\n" .
       "  -d  enable Debug prints\n" .
       "  -?  print this Help message\n" .
       "\n";
       exit 1;
}


$perlver = "3 or earlier";
if ($] =~ /\$\$Revision\:\s*([0-9.]+)\s/) {
   $perlver = $1;  # probably 4.something
} elsif ($] =~ /([0-9][0-9.]*)/) {
   $perlver = $1;  # probably 5.something or 6.something
}
if ($debugmode) {
   print "You are using Perl version $perlver\n";
}

@INC = ('.','..');
require "composite_sd.pl";  # define &sum, &avg, &sample_SD and &composite_SD


# input is an integer; result is 1 (true) iff odd, null (false) if even
sub odd {
   local($i) = @_;
   return (($i & 1) == 1);
}


# for sorting (from the Camel Book)
sub numerically { $a <=> $b; }


# input is an array of numbers; output is the median
sub median {
   local(@vals) = sort numerically @_;
   local($m,$i);
   $i = int( 0.01 + ($#vals / 2) );
   $m = $vals[ $i ];
   if (&odd($#vals)) {
      # there are an even number of values
      $m += $vals[ $i+1 ];
      $m /= 2;
   }
   return $m;
}


# $inpfile = 'church_white_grl_gmsl.lis';
$inpfile = $ARGV[0];
if (!open(INP,"<$inpfile")) {
   printf "ERROR: could not open '%s', $!\n", $inpfile;
   exit 1;
}


# absolute value
sub abs {
   local( $x ) = shift;
   if ($x < 0) {
      $x = - $x;
   }
   return $x;
}


# private state variable used by &gaussian_rand
undef $g_rand_v2;

# This subroutine generates random numbers that are normally distributed,
# with a standard deviation of 1 and a mean of 0.  This is adapted from
# http://docstore.mik.ua/orelly/perl/cookbook/ch02_11.htm  and
# http://www.experts-exchange.com/Programming/Languages/Scripting/Perl/Q_20579434.html
# It is apparently called the "polar Box Muller method."
sub gaussian_rand {
   local( $v1, $r );
   if (defined $g_rand_v2) {
      # The method generates two samples, so the 2nd call to
      # this function just saves the squirelled away 2nd value.
      $v1 = $g_rand_v2;
      undef $g_rand_v2;
   } else {
      do {
         $v1 = rand(2) - 1;  # between -1 and 1
         $g_rand_v2 = rand(2) - 1;  # also between -1 and 1
         $r = ($v1*$v1) + ($g_rand_v2*$g_rand_v2);
      } while (($r >= 1) || ($r == 0));  # most of the time this does not iterate
      $r = sqrt( (-2 * log($r)) / $r );
      $v1 *= $r;
      $g_rand_v2 *= $r;
   }
   return $v1;
}


# The theoretical standard deviation of a uniform distribution with
# a range of 1 (like Perl's built-in rand function):
$tSD_c = 1 / (2 * sqrt(3));  # 0.2886751346


# Similar to gaussian_rand, but returns random numbers that are uniformly
# distributed, with a standard deviation of 1, a mean of 0, a minimum
# value of -1.732051, and a maximum value of 1.732051.
sub uniform_rand {
   local( $result ) = rand;
   $result -= 0.5;
   $result /= $tSD_c;
   return $result;
}


# Given a mean, standard deviation, and number of samples desired,
# return an array of synthesized samples which match those parameters,
# with a uniform distribution.
sub synthesize_samples {
   local( $mean, $sd, $n ) = @_;
   local( @result );
   local( $i );

   # sanity-check the number of samples, and do something reasonable if isn't an integer > 1
   $n = int( $n + 0.5 );
   if ($n < 2) {
      $n = 2;
   }

   # Quick & dirty approximation would be simply...
   # for ($i = 0; $i < $n; $i++) {
   #    $result[$i] = ($sd * &uniform_rand) + $mean;
   # }

   # But we want it to be exact.  So...
   # 1. fill in @result with a uniform distribution
   for ($i = 0; $i < $n; $i++) {
      $result[$i] = rand;
   }
   # 2. scale it to match the desired standard deviation
   local( $calcd_sd ) = &sample_SD( @result );
   for ($i = 0; $i < $n; $i++) {
      $result[$i] *= ($sd / $calcd_sd);
   }
   # 3. bias it to match the desired mean
   local( $calcd_mean ) = &avg( @result );
   for ($i = 0; $i < $n; $i++) {
      $result[$i] += ($mean - $calcd_mean);
   }
   return @result;
}


# Calculate combined standard deviation via brute force sythetic sample generation.
# Inputs are:
#   $G, the number of groups
#   @means, the array of $G group means
#   @SDs, the array of $G group standard deviations
#   $ncount or @ncounts -- number of samples in each group (can be scalar
#                          if all groups have same number of samples)
# Result is the overall standard deviation.
sub composite_SD_synthetic {
   local( $G ) = shift @_;
   local( @means ) = splice( @_, 0, $G );
   local( @SDs ) = splice( @_, 0, $G );
   local( @ncounts );
   local( $i );
   if (0 == $#_) {
      for ($i=($G-1); $i >= 0; $i--) {
         $ncounts[$i] = $_[0];
      }
   } else {
      @ncounts = @_;
   }
   if (   ($G != ($#means + 1))
       || ($G != ($#SDs + 1))
       || ($G != ($#ncounts + 1)) ) {
      die "ERR: sub composite_SD_synthetic called w/ wrong number of parameters [$G,$#means,$#SDs,$#ncounts]\n";
   }
   # Okay, we have the parameters, now.

   # calculate total number of samples, N, and grand mean, GM
   local($N) = &sum(@ncounts);  # total number of samples
   if ($N <= 1) {
      print "Warning: only $N samples, SD is incalculable\n";
      return -1;
   }
   local($GM) = 0;
   for ($i=0; $i<$G; $i++) {
      $GM += ($means[$i] * $ncounts[$i]);
   }
   $GM /= $N; # grand mean

   # Generate the synthetic sample values for each group
   local( @fullset, @subset );
   @fullset = ();
   for ($i= 0; $i <= $#means; $i++) {
      @subset = &synthesize_samples( $means[$i], $SDs[$i], $ncounts[$i] );
      push( @fullset, @subset );
   }
   # sanity-check the mean
   local($mean2) = &avg( @fullset );
   if (&abs($GM-$mean2) > 0.001) {
      die "ERR: yr=$year, GM=$GM != m2=$mean2\n";
   }
   # calculate the standard deviation
   local( $result ) = &sample_SD( @fullset );
   if ($debugmode) { printf "dbg composite_SD_synthetic: N = $N, GM = %4.3f, SD = %4.3f\n", $GM, $result; }
   return $result;
} #composite_SD_synthetic



# print header line; if -a -s and -m are all chosen, it'll look like this:
#     " Year      mean       SD      ~SD            diff         avgSD"
#     " 1932.5   -24.31    10.68    10.68    0.00000000000001    10.66"
print " Year      mean ";
if ($method & $method_anova_1c) { print "      SD "; }
if ($method & $method_synth_2c) { print "     ~SD "; }
if (($method & ($method_synth_2c|$method_anova_1c)) == ($method_synth_2c|$method_anova_1c)) { print "           diff     "; }
if ($method & $method_mean_4c)  { print "    avgSD";}
print "\n";

# SD is the standard deviation calculated by ANOVA ("-a" or default)
# ~SD is the standard deviation calculated by approximating with synthetic data ("-s")
# diff is the absolute value of the difference between SD and ~SD.
# avgSD is the average of the group standard deviations (not a valid method!) ("-m")



# These globals are used as inputs to &analyze_one_yr:
#   @m is an array of 12 means
#   @sd is an array of 12 standard deviations
@m = @sd = ();


# four-digit $year is passed as the input parameter, but the main inputs are globals @m and @sd
sub analyze_one_yr {
   local( $year ) = @_;
   local( $sd, $sd1, $sd_a, $sd_s, $sd_m, $mean1, $i );
   if (-1 == $#m) {
      return;
   }
   if ((11 != $#m) || (11 != $#sd)) {
      die "ERR: analyze_one_yr, not 12 months?!?  year=$year, \$\#m=$#m, \$\#sd=$#sd\n";
   }
   # Now the 12 means and 12 SDs are in @m and @sd
   $mean1 = &avg( @m );
   $sd1 = '';
   if ($method & $method_anova_1c) {
      $sd_a = &composite_SD( 1+$#m, @m, @sd, $samples_per_group );
      $sd1 .= sprintf( "    %5.2f", $sd_a );
   }
   if ($method & $method_synth_2c) {
      $sd_s = &composite_SD_synthetic( 1+$#m, @m, @sd, $samples_per_group );
      $sd1 .= sprintf( "    %5.2f", $sd_s );
   }
   if (($method & ($method_synth_2c|$method_anova_1c)) == ($method_synth_2c|$method_anova_1c)) {
      $sd1 .= sprintf( "    %16.14f", &abs($sd_s-$sd_a) );
   }
   if ($method & $method_mean_4c) {
      $sd_m = &avg( @sd );  # WARNING: this is NOT a valid method!
      $sd1 .= sprintf( "    %5.2f", $sd_m );
   }
   if (!defined $year) {
      $year = '0000';
   }
   printf " %s.5  %7.2f%s\n", $year, $mean1, $sd1;
   @m = @sd = ();
} #analyze_one_yr


# SAMPLE INPUT:
#
#   years    GMSL (mm)   SD
# ---------  ---------  -----
# 1870.0417   -75.61    22.19
# 1870.1250   -75.68    22.19
# 1870.2083   -74.13    22.19
# 1870.2917   -74.06    22.19
# 1870.3750   -73.23    22.19
# 1870.4583   -74.47    22.19
# 1870.5417   -72.76    22.19
# 1870.6250   -70.71    22.19
# 1870.7083   -71.18    22.19
# 1870.7917   -70.77    22.19
# 1870.8750   -78.15    22.19
# 1870.9583   -81.73    22.19
# 1871.0417   -86.35    22.19
# 1871.1250   -82.06    22.19
# 1871.2083   -78.42    22.19
#...
# 2001.7917   118.71     8.94
# 2001.8750   114.17     9.05
# 2001.9583   110.85     9.05


$year = '0000';
@m = @sd = ();
$count_inp_lines = 0;
while (<INP>) {
   $count_inp_lines++;
   $_ =~ s/^\s+//g;  # delete any leading whitespace
   if ($debugmode) {
      print "dbg: in = $_";
   }
   ($year_i, $gmsl_i, $sd_i) = split( /\s+/, $_ );
   if ($year_i !~ "^$year") {
      # when we see a new year, process the data for the previous year
      &analyze_one_yr($year);
      # now save the January values for the new year
      $year = substr($year_i,0,4);
      @m = ($gmsl_i);
      @sd = ($sd_i);
   } else {
      # save the February through December values
      push( @m, $gmsl_i );
      push( @sd, $sd_i );
   }
}
# at end-of-file; the final year's data still needs to be analyzed
if ($#m > 0) {
   &analyze_one_yr($year);
}
close INP;

__END__

