#!/usr/bin/perl

# A program to calculate, for the 159 GLOSS-LTT tide stations in NOAA's list,
# a geographically-weighted average linear Mean Sea Level trend, using
# weightings determined by geographic proximity to other GLOSS-LTT tide
# stations.
#
# Here's NOAA's GLOSS-LTT list:
# http://tidesandcurrents.noaa.gov/sltrends/MSL_global_trendtable.html
#
# Run this program with no parameters for help.
#
# This program is compatible both with 32-bit Perl 4.036, and Perl 5.
#
# Copyright 2010, by David A. Burton
# Cary, NC  USA
# +1 919-481-0149
# Email: http://www.burtonsys.com/email/


# Turn on autoflush for the currently selected output handle (STDOUT by
# default), so that debug prints appear immediately instead of sitting in
# the output buffer.
$| = 1;


@INC = ('.','..');
require "splitcsv.pl";  # define &split_csv
require "composite_sd.pl";  # define &sum, &avg, &sample_SD and &composite_SD


# Keep a pristine copy of the command line.  The option loop below shifts
# the options off of @ARGV, but &echo_cmd wants to show all of them.
@sv_ARGV = @ARGV;



# Default (1-based) field numbers of the columns of interest in the input
# .csv file.  Most can be overridden by the command-line options parsed
# below (there is no option in that loop which sets $fn_ci).
$fn_name = 1;  # station name
$fn_years = 4;  # approximate number of years the station operated
$fn_trend = 5;  # local MSL trend for the station
$fn_ci = 6;  # 95% Confidence Interval for the MSL trend
$fn_lat = 13;  # latitude of the station
$fn_lon = 14;  # longitude of the station

# Defaults for the other options:
$opt_sort = 0;  # no sorting
$opt_firstrec = 1;  # start with the first record
$opt_lastrec = -1;  # -1 means "through the last record"
$opt_duptest = 0;  # skip the duplicate-records sanity test
$opt_export = 0;  # do not create the geo_weights.csv file
$opt_alldist = 0;  # do not calculate for a broad range of knee & limit values
$opt_lmsl = 0;  # do not calculate SD & 95% CI for LMSLs
$kneeDist = 400;  # "knee" -- distance (km) at which correlation weight is 0.3333
$limitDist = 800;  # "limit" -- distance (km) beyond which correlation weight is zero
$kneeDist_specified = 0;  # set to 1 if '-knee=' is given
$limitDist_specified = 0;  # set to 1 if '-limit=' is given

$debugmode=0;  # for debug prints; incremented once per '-d'

# Consume leading arguments that begin with '-'.  '-d' must match exactly;
# '-xxx=' options carry a numeric value after the '='; the remaining flag
# options are recognized by their prefix.
while ((@ARGV > 0) && ($ARGV[0] =~ /^-/)) {
   if ($ARGV[0] eq '-d') {
      $debugmode++;  # turn on debug prints
   } elsif ($ARGV[0] =~ /^-name=/) {
      $fn_name = substr($ARGV[0], length('-name=')) + 0;  # field number of the "name" field
   } elsif ($ARGV[0] =~ /^-trend=/) {
      $fn_trend = substr($ARGV[0], length('-trend=')) + 0;  # field number of the "MSL_trend" field
   } elsif ($ARGV[0] =~ /^-lat=/) {
      $fn_lat = substr($ARGV[0], length('-lat=')) + 0;  # field number of the "latitude" field
   } elsif ($ARGV[0] =~ /^-lon=/) {
      $fn_lon = substr($ARGV[0], length('-lon=')) + 0;  # field number of the "longitude" field
   } elsif ($ARGV[0] =~ /^-years=/) {
      $fn_years = substr($ARGV[0], length('-years=')) + 0;  # field number of the "number_of_years" field
   } elsif ($ARGV[0] =~ /^-sort/) {
      $opt_sort++;  # sort by MSL trend
   } elsif ($ARGV[0] =~ /^-duptest/) {
      $opt_duptest++;  # do the duplicate-records sanity test of geographically-weighted averaging
   } elsif ($ARGV[0] =~ /^-first=/) {
      $opt_firstrec = substr($ARGV[0], length('-first=')) + 0;  # record number of first record to process
   } elsif ($ARGV[0] =~ /^-last=/) {
      $opt_lastrec = substr($ARGV[0], length('-last=')) + 0;  # record number of last record to process
   } elsif ($ARGV[0] =~ /^-limit=/) {
      $limitDist = substr($ARGV[0], length('-limit=')) + 0;  # distance beyond which correlation is zero
      $limitDist_specified = 1;
   } elsif ($ARGV[0] =~ /^-knee=/) {
      $kneeDist = substr($ARGV[0], length('-knee=')) + 0;  # distance beyond which correlation is < 0.3333
      $kneeDist_specified = 1;
   } elsif ($ARGV[0] =~ /^-alldist/) {
      $opt_alldist++;  # test the full range of possible maximum correlation distances
   } elsif ($ARGV[0] =~ /^-lmsl/) {
      $opt_lmsl++;  # also calculate SD & 95% CI for LMSLs, rather than just for global avg MSL
   } elsif ($ARGV[0] =~ /^-export/) {
      $opt_export++;  # create the geo_weights.csv file
   } else {
      printf STDERR "ERROR: unrecognized command-line option: '%s'\n", $ARGV[0];
      exit 1;
   }
   shift @ARGV;
}

# If exactly one of '-knee=' / '-limit=' was given, derive the other one
# so that limit = 2 * knee.
if ($limitDist_specified != $kneeDist_specified) {
   if ($limitDist_specified) {
      $kneeDist = $limitDist / 2.0;
   } else {
      $limitDist = 2 * $kneeDist;
   }
}


# Nonzero once &echo_cmd has printed the command line.
$cmd_already_echoed = 0;

# Print the command line that started this run (perl executable name with
# its directory stripped, the script name, and the original arguments
# saved in @sv_ARGV).  Only the first call prints; later calls do nothing.
sub echo_cmd {
  return if ($cmd_already_echoed);
  $perlname = "$^X";
  $perlname =~ s/^.+[\\\/]//;  # keep only what follows the last '\' or '/'
  print "\n$perlname $0 " . join(' ', @sv_ARGV) . "\n";
  $cmd_already_echoed = 1;
}


# With debug prints enabled, start the output by echoing the command line.
&echo_cmd() if ($debugmode > 0);



# Constants used by &distance:
$pi = 3.1415926535897932384626433832795;
$pi_over_180 = $pi / 180.0;  # degrees-to-radians factor, 0.017453292519943295769236907684886
$radius_of_earth = 6366707.0;  # in meters

# After the option-parsing loop has shifted the options off of @ARGV,
# exactly one argument (the input .csv file name) must remain.  Otherwise
# print the help text to stdout and exit with status 1.
$num_args = $#ARGV+1;

if ($num_args != 1) {
 print "calculate_distance_weighted_msl_avg2.pl --\n" .
       "\n" .
       "   Calculate geographical distance-weighted global linear Mean Sea Level (MSL)\n" .
       "   average from GLOSS-LTT tide stations data.\n" .
       "\n" .
       "Typical usage:\n" .
       "   perl $0 MSL_global_trendtable.csv\n" .
       "\n" .
       "Or, with options specified:\n" .
       "   perl $0 {options} MSL_global_trendtable.csv\n" .
       "\n" .
       "E.g., for debug prints:\n" .
       "   perl $0 -d MSL_global_trendtable.csv\n" .
       "\n" .
       "'MSL_global_trendtable.csv' is processed as follows:\n" .
       "\n" .
       "1) Read all 159 records, noting the Mean Sea Level (MSL) trend for\n" .
       "   each station and the station's coordinates.\n" .
       "2) For each station location, check it against every other location, calculate the\n" .
       "   distance D between the two locations, and the correlation weight W calculated\n" .
       "   from the distance.  Then (unless W=0) add the name/distance/weight triple\n" .
       "   to the list of nearby stations for that station location.\n" .
       "3) Also, for each location, sum the correlation weights of the nearby locations.\n" .
       "4) For each station, calculate its own weight W_own (to a maximum of 1.0) from the\n" .
       "   summed weights of the nearby stations, according to the formula:\n" .
       "     W_own = 1.0 / (1.0 + sum(nearby_weights))\n" .
       "   Thus, an isolated station has a weight of 1.0, but a station near other stations\n" .
       "   has a lower weight; e.g., a station which is very near just 1 other station has a\n" .
       "   weight of about 0.5.\n" .
       "5) Finally, calculate the geographically weighted average linear MSL trend from all\n" .
       "   159 stations, and display it.\n" .
       "\n" .
       "The following options are supported:\n" .
       "  -sort     (sort the records by increasing MSL trend before processing)\n" .
       "  -first=xx (where xx is first record number to process; default xx=1)\n" .
       "  -last=xx  (where xx is last record to process; default=all, normally xx=159)\n" .
       "  -limit=xx (set 'limit' distance in km, where correlation is 0; default xx=800)\n" .
       "  -knee=xx  (set 'knee' distance, where correlation is 0.3333; default is 1/2 'limit')\n" .
       "  -duptest  (do a 'duplicate records test' of geographically-weighted averaging)\n" .
       "  -alldist  (test a broad range of possible '-limit' maximum correlation distances)\n" .
       "  -lmsl     (also calculate SD \& 95% CI for LMSLs, as well as for global avg MSL)\n" .
       "  -export   (create the geo_weights.csv file, to export the geographical weights)\n" .
       "  -name=xx  (where xx is field number of the station name, default xx=1)\n" .
       "  -years=xx (where xx is field number of the \# years of operation, default xx=4)\n" .
       "  -trend=xx (where xx is field number of the MSL trend, default xx=5)\n" .
       "  -lat=xx   (where xx is field number of the latitude, default xx=13)\n" .
       "  -lon=xx   (where xx is field number of the longitude, default xx=14)\n" .
       "  -d        (enable debug prints)\n" .
       "  -d -d     (enable more debug prints)\n" .
       "\n" .
       "Problems/questions?  Call or email:\n" .
       "Dave Burton\n" .
       "http://www.burtonsys.com/email/\n" .
       "+1-919-481-0149\n" .
       "\n" .
       "Note: if this help message doesn't entirely fit on your screen, then do this:\n" .
       "\n" .
       "  perl $0 | more\n";
       exit 1;
}

# The one remaining argument is the input .csv file; open it for reading
# on the INP handle, or complain on stderr and quit.
$inpfile = $ARGV[0];
unless (open(INP,"<$inpfile")) {
   printf STDERR "ERROR: could not open '%s', $!\n", $inpfile;
   exit 1;
}
print "reading '$inpfile' . . .\n" if ($debugmode > 0);


# Classify a field as a Degrees/Minutes coordinate.
# Input:  one string (may be undefined), e.g. "35 30.5 N" or "120,15W":
#         whole degrees, a blank or comma, minutes (digits and '.'), an
#         optional blank, and one of the letters N, S, E or W.
# Result: 'LAT' if the letter is N or S, 'LON' if it is E or W, and undef
#         if the string is undefined or does not have that form.
# Note:   the degree and minute values are not range-checked.
sub is_DM {
  local( $fld ) = @_;
  local( $kind );  # stays undefined unless $fld parses as a coordinate
  if (defined $fld) {
     if ($fld =~ /^([0-9]+)[ \,]([0-9][0-9\.]*)( |)([NSEW])$/) {
        # $4 is the hemisphere letter
        $kind = ($4 =~ /[NS]/) ? "LAT" : "LON";
     }
  }
  return $kind;
}


# Convert a Degrees/Minutes latitude or longitude (same textual form that
# &is_DM recognizes, e.g. "35 30 N" or "120,15W") to signed decimal degrees.
# Input:  the coordinate string.
# Result: degrees + minutes/60, negated for the S and W hemispheres.
#         If the string is not recognized, a message is written to stderr
#         and the result is undef.
sub DegMin_to_DecimalDeg {
  local( $fld ) = @_;
  local( $whole, $minutes, $hemisphere, $decimal );
  unless ($fld =~ /^([0-9]+)[ \,]([0-9][0-9\.]*)( |)([NSEW])$/) {
     print STDERR "ERR: DegMin_to_DecimalDeg(\"$fld\")  (not a recognized latitude or longitude)\n";
     return $decimal;  # still undefined here
  }
  ($whole, $minutes, $hemisphere) = ($1, $2, $4);
  $decimal = $whole + ($minutes / 60);
  if ($hemisphere =~ /[SW]/) {
     $decimal = -$decimal;  # south and west are negative
  }
  return $decimal;
}



$num_recs = 0;
# Read the spreadsheet records (.csv format) into six parallel arrays:
#   @rec_name (names of the stations)
#   @rec_years (no. of years of operation for each station)
#   @rec_trend (MSL trend at each station)
#   @rec_ci (95% confidence interval for each station's MSL trend)
#   @rec_lat (latitude of each station, decimal degrees)
#   @rec_lon (longitude of each station, decimal degrees)
# $num_recs = number of stations (should be 159)
#
# Records with no content in any field are skipped.  A record whose
# latitude/longitude fields are not in Degrees/Minutes form is fatal.
while (<INP>) {
   # Strip the line terminator (any mix of cr and lf; Perl 4 lacks chomp).
   # Do NOT chop first: if the last line of the file has no terminator,
   # chop would remove the final data character (the longitude's E/W).
   $_ =~ s/[\r\n]*$//;

   # Split into fields.
   @sfields = &split_csv( $_ );
   $num_fields_with_content = 0;
   $first_field_is_only_field = 1;
   $fieldnumber = 0;
   foreach $fld (@sfields) {
      # get rid of leading and trailing whitespace in each field:
      if ($fld =~ /\s/) {
         $fld =~ s/^\s*//;  # strip leading whitespace, too.
         $fld =~ s/\s*$//;  # strip trailing blanks, tabs, cr, lf, etc.
         # get rid of any tabs, and collapse multiple spaces to one space:
         if ($fld =~ /\t|\s\s/) {
            $fld =~ s/\s+/ /g;
         }
      }
      if ('' ne $fld) {
         $num_fields_with_content++;
         if ($fieldnumber > 0) {
            $first_field_is_only_field = 0;
         }
      }

      # # upper-case each field value
      # $fld =~ tr/a-z/A-Z/;

      $fieldnumber++;
   }
   if (!$num_fields_with_content) {
      next;  # skip empty records
   }

   # 1st field is normally the name of the station
   # 4th field is normally the no. of years the station operated
   # 5th field is normally the MSL trend for the station
   # 6th field is normally the 95% confidence interval for the station's MSL trend
   # 13th field is normally the latitude of the station
   # 14th field is normally the longitude of the station
   # Most of these field numbers can be overridden by command-line options.

   if ( (&is_DM($sfields[$fn_lat-1]) ne 'LAT') ||
        (&is_DM($sfields[$fn_lon-1]) ne 'LON') ) {
      # die does not interpret printf-style formats, so build the message
      # with sprintf.  The record number reported is 1-based, to match the
      # numbering used by the '-first=' and '-last=' options.
      die sprintf( "ERR: record no. %d ('%s') lacks Lat/Lon in fields %d-%d: F%d='%s', F%d='%s'.\n",
                   $num_recs + 1, $sfields[$fn_name-1], $fn_lat, $fn_lon,
                   $fn_lat, $sfields[$fn_lat-1], $fn_lon, $sfields[$fn_lon-1] );
   }
   $rec_name[$num_recs] = $sfields[$fn_name-1];
   $rec_years[$num_recs] = $sfields[$fn_years-1] + 0;
   $rec_trend[$num_recs] = $sfields[$fn_trend-1] + 0.0;
   $rec_ci[$num_recs] = $sfields[$fn_ci-1] + 0.0;
   $rec_lat[$num_recs] = &DegMin_to_DecimalDeg($sfields[$fn_lat-1]);
   $rec_lon[$num_recs] = &DegMin_to_DecimalDeg($sfields[$fn_lon-1]);
   $num_recs++;
}
print "$num_recs station records were read from $inpfile\n";
close INP;

# NOAA's GLOSS-LTT list has 159 stations; anything else is worth a warning.
unless ($num_recs == 159) {
   print STDERR "Warning: input file should have 159 records (one per GLOSS-LTT\n" .
         "tide station), but it actually has $num_recs records.\n";
}

# -1 is the "not specified" value for '-last='; it means the final record.
if ($opt_lastrec == -1) {
   $opt_lastrec = $num_recs;
   print "dbg: \$opt_lastrec defaulted to \$num_recs=$num_recs\n" if ($debugmode >= 2);
}

# Record numbers are 1-based and must satisfy 1 <= first <= last <= $num_recs.
die "ERROR: record number cannot be less than 1\n"
   if (($opt_lastrec <= 0) || ($opt_firstrec <= 0));
die "ERROR: you specified -last=$opt_lastrec but there are only $num_recs records.\n"
   if ($opt_lastrec > $num_recs);
die "ERROR: you specified -first=$opt_firstrec, but it must be <= -last=$opt_lastrec.\n"
   if ($opt_firstrec > $opt_lastrec);

# Processing only a subset of the records makes sense only for sorted
# records, and then normally with equal numbers trimmed from both ends.
if (($opt_firstrec != 1) || ($opt_lastrec != $num_recs)) {
   if (! $opt_sort) {
      print STDERR "Warning: specifying '-first=' and/or '-last=' without '-sort' is probably a mistake.\n";
   } elsif (($opt_firstrec - 1) != ($num_recs - $opt_lastrec)) {
      printf STDERR "Warning: you've omitted the %d lowest MSL trends but the %d highest MSL\n" .
             "trends.  Omitting differing numbers of sites at the low and high ends of\n" .
             "the range of MSL trends may bias the results.\n",
             ($opt_firstrec - 1), ($num_recs - $opt_lastrec);
   }
}

# Sanity-check the shape of the distance-to-weight curve.
if ($limitDist < (1.5 * $kneeDist)) {
   die "ERROR: '-limit' must be >= '-knee'\n" if ($limitDist < $kneeDist);
   print STDERR "Warning: concave curve; '-limit' should normally be least 1.5 times '-knee'\n";
}



# Generate the list of values M, M+1, ..., up to and including N (somewhat
# like APL's 'iota').  The result is the empty list if M > N.
# Example:  &range(0,5) returns (0,1,2,3,4,5)
sub range {
   local( $M, $N ) = @_;
   local( @result, $i );
   @result = ();
   $i = $M;
   while ($i <= $N) {
      $result[$#result + 1] = $i;  # append
      $i++;
   }
   return @result;
}


# Compare function for sort, for sorting a list of INDICES into @rec_trend
# (the array name is hard-coded) rather than the trend values themselves.
# sort supplies the two indices being compared in $a and $b; the result
# is the <=> comparison of the two MSL trends they refer to, so the
# indices end up ordered by increasing trend.
sub compare_indirectly {
   # print "dbg: compare_indirectly, a=$a, b=$b, ofs(a)=" . $rec_trend[$a] . ", ofs(b)=" . $rec_trend[$b] . "\n";
   $rec_trend[$a] <=> $rec_trend[$b];
}


# Reorder (and/or take a subset of) an array according to an array of
# indices.  Example:
#   @indices = (0, 4, 1, 3, 2);
#   @values = (100, 101, 102, 103, 104 );
#   @result = &indexby( $#indices, @indices, @values );
# and @result is:
#   (100, 104, 101, 103, 102)
#
# Parameters (all flattened into one argument list):
#   1st      -- the LAST INDEX of the indices list (i.e., its count minus 1),
#               which is how the two lists that follow are told apart
#   next     -- the indices
#   the rest -- the values
# Result: the list of values selected by the indices, in index order.
#
# This is the same thing an array slice does, so where both arrays are at
# hand you can simply write:
#   @result = @values[ @indices ];
# (Note the '@' sigil.  Writing $values[ @indices ] instead evaluates
# @indices in a scalar context, which yields the number of elements, 5,
# so you'd get the single element $values[5].)
sub indexby {
   local( $numindices ) = 1 + shift @_;
   local( @indices ) = splice( @_, 0, $numindices );
   local( @values ) = @_;
   # Assign the slice to an array before returning it, so that the sub
   # returns an array (as it always has) rather than a slice expression.
   local( @result ) = @values[ @indices ];
   return @result;
}

# test it:
# @indices = (0, 4, 1, 3, 2);
# @values = (100, 101, 102, 103, 104 );
# @result = &indexby( $#indices, @indices, @values );  # @result = $values[ @indices ];
# print "dbg: (" . join(',',@result) . ")\n\n";  # should be (100,104,101,103,102)


# Start from the identity ordering: one index per record that was read.
@indices = &range( 0, $#rec_trend );  # 0..158


# '-sort': order the indices by increasing MSL trend.
if ($opt_sort) {
   @indices = sort compare_indirectly @indices;
   printf("dbg: Sorting %d records.\n", 1+$#indices) if ($debugmode > 0);
}
# '-last=...': drop the indices after position $opt_lastrec (if sorted,
# those are the highest MSL trends).
if ($opt_lastrec < $num_recs) {
   splice( @indices, $opt_lastrec );
   printf("dbg: Removing last %d records.\n", ($num_recs - $opt_lastrec)) if ($debugmode > 0);
}
# '-first=...': drop the indices before position $opt_firstrec (if sorted,
# those are the lowest MSL trends).
if ($opt_firstrec > 1) {
   splice( @indices, 0, ($opt_firstrec-1) );
   printf("dbg: Removing first %d records.\n", ($opt_firstrec - 1)) if ($debugmode > 0);
}
# So far only @indices has been manipulated.  If any of the three options
# was in effect, apply the same selection/ordering to every data array, so
# that they stay parallel to one another.
if (($opt_firstrec > 1) || ($opt_lastrec < $num_recs) || $opt_sort) {
   @rec_name = &indexby( $#indices, @indices, @rec_name );    # (names of the stations)
   @rec_years = &indexby( $#indices, @indices, @rec_years );  # (no. of years of operation of each station)
   @rec_trend = &indexby( $#indices, @indices, @rec_trend );  # (MSL trend at each station)
   @rec_ci = &indexby( $#indices, @indices, @rec_ci );        # (95% confidence interval for each station's MSL trend)
   @rec_lat = &indexby( $#indices, @indices, @rec_lat );      # (latitude of each station)
   @rec_lon = &indexby( $#indices, @indices, @rec_lon );      # (longitude of each station)
}
undef @indices;

# If records were dropped, say so (after echoing the command line, to show
# which options caused it), and make $num_recs match what is left.
unless ((1+$#rec_trend) == $num_recs) {
   &echo_cmd;
   printf "*** %d of %d records excluded from processing.\n", ($num_recs-(1+$#rec_trend)), $num_recs;
   $num_recs = $#rec_trend + 1;
}



# Parity test.  Input is an integer; result is 1 (true) if its lowest bit
# is set (i.e., it is odd), and null (false) if it is even.
sub odd {
   local($i) = @_;
   local($low_bit) = ($i & 1);
   return ($low_bit == 1);
}


# Compare function for sort (from the Camel Book): orders values by
# increasing numeric value.  sort supplies the two values being compared
# in $a and $b.  Usage:  @sorted = sort numerically @unsorted;
sub numerically { $a <=> $b; }


# Median of a list of numbers.
# Input:  the numbers, in any order.
# Result: the middle value of the numerically sorted list, or, when the
#         list has an even number of values, the mean of the two middle
#         values.
sub median {
   local(@vals) = sort numerically @_;
   local($last) = $#vals;  # index of the last value; odd when the count is even
   local($mid) = int( 0.01 + ($last / 2) );  # index of the (lower) middle value
   local($m) = $vals[ $mid ];
   if (&odd($last)) {
      # an even number of values: average the two in the middle
      $m = ($m + $vals[ $mid+1 ]) / 2;
   }
   return $m;
}



# Compute the great-circle distance between two locations via the
# Haversine Formula.
# (per http://mathforum.org/library/drmath/view/51879.html)
# Input is the two (latitude,longitude) pairs, in degrees:
#   &distance( $lat1, $lon1, $lat2, $lon2 )
# Output is the distance between the two locations, in meters.
# Uses the globals $pi_over_180 and $radius_of_earth.
# Formula is:
#   dlon = lon2 - lon1
#   dlat = lat2 - lat1
#   a = (sin(dlat/2))^2 + cos(lat1) * cos(lat2) * (sin(dlon/2))^2
#   c = 2 * atan2(sqrt(a), sqrt(1-a))
#   d = R * c
sub distance {
   local( $lat1, $lon1, $lat2, $lon2 ) = @_;
   # print "dbg: calculate distance between ($lat1,$lon1) and ($lat2,$lon2)\n";
   local( $result ) = 0.0;
   # first, convert degrees to radians:
   $lat1 *= $pi_over_180;
   $lon1 *= $pi_over_180;
   $lat2 *= $pi_over_180;
   $lon2 *= $pi_over_180;
   # then calculate according to the Haversine Formula:
   local( $dlon ) = $lon2 - $lon1;
   local( $dlat ) = $lat2 - $lat1;
   local( $a1 ) = sin( $dlat/2.0 );
   $a1 = $a1 * $a1;
   local( $a2 ) = sin( $dlon/2.0 );
   $a2 = $a2 * $a2;
   # ("a" in the formula; named $hav here so as not to localize $a, which
   # is one of the two variables that sort compare functions use)
   local( $hav ) = $a1 + (cos($lat1) * cos($lat2) * $a2);
   # Mathematically 0 <= a <= 1 for valid coordinates, but floating-point
   # rounding can push it a hair past 1 for (nearly) antipodal points, and
   # sqrt of the resulting negative number would be a fatal error.
   if ($hav > 1.0) {
      $hav = 1.0;
   }
   if ($hav < 0.0) {
      $hav = 0.0;
   }
   local( $c ) = 2.0 * atan2( sqrt($hav), sqrt(1.0-$hav) );
   $result = $radius_of_earth * $c;
   return $result;
}


## These are some sanity checks, which I used to verify that &distance
## works correctly (it does):
#
# $d1 = &distance( 0.0, 0.0, 0.0, 1.0 );
# # should be 1/360 of the circumference of the earth, which is
# # (1/360) * 40,003,200 meters = 111,120 meters.
# print " $d1 should be about 111,120 meters.\n";
#
# $d1 = &distance( 0.0, -1.0, 0.0, 0.0 );
# # should be 1/360 of the circumference of the earth, which is
# # (1/360) * 40,003,200 meters = 111,120 meters.
# print " $d1 should be about 111,120 meters.\n";
#
# $d1 = &distance( 0.0, -1.0, 0.0, 1.0 );
# # should be 2/360 of the circumference of the earth, which is
# # (2/360) * 40,003,200 meters = 222,240 meters.
# print " $d1 should be about 222,240 meters.\n";
#
# $d1 = &distance( 0.0, 1.0, 0.0, -1.0 );
# # should be 2/360 of the circumference of the earth, which is
# # (2/360) * 40,003,200 meters = 222,240 meters.
# print " $d1 should be about 222,240 meters.\n";
#
# $d1 = &distance( 0.0, -90.0, 0.0, 90.0 );
# # should be 1/2 of the circumference of the earth, which is
# # (1/2) * 40,003,200 meters = 20,001,600 meters.
# print " $d1 should be about 20,001,600 meters.\n";
#
# $d1 = &distance( 90.0, 0.0, -90.0, 0.0 );
# # should be 1/2 of the circumference of the earth, which is
# # (1/2) * 40,003,200 meters = 20,001,600 meters.
# print " $d1 should be about 20,001,600 meters.\n";
#
# $d1 = &distance( 0.0, 0.0, 0.0, 0.000009 );
# # should be about 1 meter.
# print " $d1 should be about 1 meter.\n";
#
# $d1 = &distance( 0.000009, 0.0, 0.0, 0.000009 );
# # should be about sqrt(2) = 1.414 meter.
# print " $d1 should be about 1.414 meter.\n";



# Piecewise-linear approximating function for the measured/graphed
# correlation between distance and MSL Trend:
#
#   D     Y   Y_norm    W
#  ---  ----  ------  ----
#    0  0.00   0.00   1.00
#  400  2.00   0.67   0.33
#  800  3.00   1.00   0.00
#
# Input is distance D in km; output is the correlation weight W, a number
# between 0 and 1:
#   D <= 0.001 (1 meter or less) ...  W = 1
#   0.001 < D <= knee .............  Y rises linearly from 0 to 2
#   knee < D < limit ..............  Y rises linearly from 2 to 3
#   D >= limit ....................  Y = 3
# and then W = 1 - Y/3.
#
# Note that although the 'knee' and 'limit' distances (for W=0.3333 and
# W=0.0, respectively) default to 400 and 800 km, they are read from the
# globals $kneeDist and $limitDist, which can be adjusted by program
# command-line parameters ('-knee=' and '-limit=', respectively).
sub dist_to_weight {
  local( $D ) = @_;
  # $Y is localized along with the others, so that calling this sub does
  # not create or clobber a global $Y.
  local( $Y, $Y_norm, $W );
   if ($D <= 0.001) {
      $W = 1;  # at 1 meter or less, W=1
   } else {
      # The ">= limit" test comes first so that it takes precedence over
      # the "<= knee" test if the two distances are equal.
      if ($D >= $limitDist) {
         $Y = 3;
      } elsif ($D <= $kneeDist) {
         $Y = 2 * ($D/$kneeDist);
      } else {
         # here $kneeDist < $D < $limitDist, so the divisor is positive
         $Y = 2 + (($D-$kneeDist)/($limitDist-$kneeDist));
      }
      $Y_norm = $Y / 3;
      $W = 1 - $Y_norm;
   }
   return $W;
}


# Shorten a station name for display: drop everything from the first
# comma or slash onward (e.g., the country name), and abbreviate
# "Galveston Pier 21" to just "Galveston".
sub city_name_only {
   local($station) = @_;
   $station =~ s/[\,\/].*$//;
   return (("Galveston Pier 21" eq $station) ? "Galveston" : $station);
}


#   Average MSL trend of the tide stations, with each station weighted by
# its number of years of operation (so that every station-year counts
# equally).
#   Takes no parameters.  Inputs are two of the global arrays which we read
# from the input .csv file, @rec_trend and @rec_years, and the number of
# records, $num_recs.
#   Result is sum(years * trend) / sum(years).
sub duration_weighted_avg {
   local( $total_change, $total_years, $k );
   $total_change = 0.0;
   $total_years = 0.0;
   $k = 0;
   while ($k < $num_recs) {
      $total_years += $rec_years[$k];
      $total_change += $rec_years[$k] * $rec_trend[$k];
      $k++;
   }
   return $total_change / $total_years;
}#duration_weighted_avg


#   Inputs to this subroutine are four of the global arrays which we read from
# the input .csv file: @rec_name, @rec_trend, @rec_lat, @rec_lon, and the number
# of records $num_recs.  Result is the geographically-weighted average MSL trend
# for the tide stations.
#   Also, if $debugmode >= 1, then a lot of debug info is written to stdout.
#
#   For each station location, check it against every other location, calculate the
# distance D between the two locations, and the correlation weight W calculated
# from the distance.  Then (unless W=0) add the name/distance/weight triple
# to the list of nearby stations for that station location.
#   Also, for each location, sum the correlation weights of the nearby locations.
#   For each station, calculate its own weight W_own (to a maximum of 1.0) from the
# summed weights of the nearby stations, according to the formula:
#   W_own = 1.0 / (1.0 + sum(nearby_weights))
# Thus, an isolated station has a weight of 1.0, but a station near other stations
# has a lower weight; e.g., a station which is very near just 1 other station has a
# weight of about 0.5.
#   The W_own weights are recorded in @rec_weight (a side-effect result).
#   Finally, calculate the geographically weighted average linear MSL trend from all
# 159 stations, and return it as the subroutine result.
#
#   Also, there's an optional input parameter, which is the name of a file to
# which the W_own weights are to be written in the form of a .csv file, along
# with the key fields from the input file.
sub dist_weighted_avg {
   # Optional parameter: name of a .csv file to create, receiving one line
   # per station with its name, MSL trend, latitude, longitude and W_own.
   # Side effects: fills the global arrays @nearbyweight (per-station sum
   # of the nearby stations' correlation weights) and @rec_weight (W_own).
   local( $outpfile ) = @_;
   local( $name1, $name2, $dist, $weight );
   local( $nearbylist );
   local( $csvname );
   local( $s1, $s2 );
   if (defined $outpfile) {
      if (!open(OUTP,">$outpfile")) {
         die "ERROR: could not create '$outpfile', $!\n";
      }
      # print a heading line for the output file:
      print OUTP '"Station_name",MSL_trend,"Latitude","Longitude",Weight' . "\n";
   }
   for ($s1 = 0; $s1 < $num_recs; $s1++) {
      $nearbyweight[$s1] = 0;
      $nearbylist = '';
      for ($s2 = 0; $s2 < $num_recs; $s2++) {
         if ($s2 != $s1) {
            $dist = &distance( $rec_lat[$s1], $rec_lon[$s1], $rec_lat[$s2], $rec_lon[$s2] ) / 1000.0;  # divided by 1000 to convert meters to km
            if ($dist < $limitDist) {
               $weight = &dist_to_weight( $dist );
               $nearbyweight[$s1] += $weight;
               if ($debugmode > 0) {
                  # the shortened name is only needed for the debug list,
                  # so it is only computed here (not for every pair)
                  $name2 = &city_name_only( $rec_name[$s2] );  # trim off country names, for conciseness
                  $nearbylist .= sprintf( "(%s, d=%0.1f, w=%0.3f)", $name2, $dist, $weight ) . "; ";
               }
            }
         }
      }
      # this station has correlation weight 1.0; how much of the total is it for this location?
      $rec_weight[$s1] = 1.0 / (1.0 + $nearbyweight[$s1]);  # 1.0 for isolated tide stations, but as low as 0.07 for stations with many others nearby
      if ($debugmode > 0) {
         # for each station, display the nearby station weight sum W_near, this station's weight W_own, and the list of nearby stations with their distances and correlation weights:
         # (the name is passed as a printf argument, not interpolated into
         # the format, so a '%' in a station name cannot garble the output)
         $name1 = &city_name_only( $rec_name[$s1] );  # trim off country names, for conciseness
         printf "dbg: %d. %s, MSLt=%0.3f, lat/lon=%0.2f/%0.2f, W_near=%0.3f, W_own=%0.3f, list = %s\n",
                $s1, $name1, $rec_trend[$s1], $rec_lat[$s1], $rec_lon[$s1], $nearbyweight[$s1], $rec_weight[$s1], $nearbylist;
         $nearbylist = '';
      }
      if (defined $outpfile) {
         # in a quoted .csv field, an embedded quote mark is written doubled
         $csvname = $rec_name[$s1];
         $csvname =~ s/"/""/g;
         print OUTP '"' . $csvname . '",' . $rec_trend[$s1] . ',"' . $rec_lat[$s1] . '","' . $rec_lon[$s1] . '",' . $rec_weight[$s1] . "\n";
      }
   }#for
   if (defined $outpfile) {
      # errors in buffered writes (e.g., disk full) are reported by close
      if (!close(OUTP)) {
         die "ERROR: could not finish writing '$outpfile', $!\n";
      }
   }
   # the weighted average is sum(W_own * trend) / sum(W_own):
   local($summed_weights) = 0.0;
   local($summed_weighted_trends) = 0.0;
   for ($s1 = 0; $s1 < $num_recs; $s1++) {
      $summed_weights += $rec_weight[$s1];
      $summed_weighted_trends += ($rec_weight[$s1] * $rec_trend[$s1]);
   }
   local($weighted_avg) = $summed_weighted_trends / $summed_weights;
   return $weighted_avg;
}#dist_weighted_avg


#   This is a companion function for &dist_weighted_avg, to calculate
# the +/- 95% confidence interval.  You must call &dist_weighted_avg
# before calling this function, because this function uses the @rec_weight
# array which &dist_weighted_avg creates.
#   Inputs to this subroutine are globals @rec_weight, @rec_ci, and the
# number of records $num_recs.  It returns four results:
#   First two results are the standard deviation and combined 95% confidence
# interval for the distance-weighted average MSL trend.  I.e., we can say
# with 95% confidence that the global average MSL trend is the result
# of &dist_weighted_avg +/- the 95% CI result.  The 3rd & 4th results
# are a similar concept but very different result: the SD and 95% CI
# for the LOCAL mean sea levels.  These are much larger numbers, because
# they include the variation in LMSL trend between tide stations, due to
# local factors such as Post-Glacial Rebound and local subsidence.
#
# I am very grateful to Dr. Gordon Simons for his expert advice...
#
# On Wed, Mar 17, 2010, Gordon Simons <gsimons@email.unc.edu> wrote:
#
# > David,
# >
# > A 95% confidence interval is the value plus or minus 1.96
# > times the standard deviation. So one can compute the standard
# > deviation for that value by dividing the TOTAL length of the
# > confidence interval by twice 1.96.
# >
# > If there is a valid reason to believe the observation are
# > independent of each other, then one can go on to compute a
# > confidence interval for the average, or for a weighted average.
# > If not, proceeding is going to give you nonsense. So let's
# > assume you have independence.
# >
# > Let x_1,...,x_n denote n numbers (values) with standard
# > deviations s_1,...,x_n respectively. Further, denote the
# > weights by w_1,...,w_n, respectively. What you seek is a
# > confidence interval for the sum of products:
# >
# >        y = w_1 x_1 + ... + w_n x_n.
# >
# > Here, you probably intend for the weights to add to unity,
# > but this does not matter for what follows.
# >
# > The standard deviation for this weighted average, call it t,
# > can be computed as the square root of the sum
# >
# >          w_1^2 s_1^2 + ,,, + w_n^2 s_n^2
# >
# > (Here, a^2 means the square of a.)
# >
# > The desired 95% confidence interval becomes y plus or minus 1.96 t.
#
sub dist_weighted_CI {
   # Takes no parameters; reads the globals @rec_weight, @rec_ci,
   # @rec_trend, @rec_name, $num_recs and $debugmode.
   # Returns the list ($SD1, $CI1, $SD2, $CI2):
   #   $SD1, $CI1 -- SD and 95% CI of the distance-weighted average MSL trend
   #   $SD2, $CI2 -- SD and 95% CI for the local MSLs (from &composite_SD)
   local( $i, $SD1, $SD2, $CI1, $CI2, $product, $sum );
   # Scratch variables.  They are localized so that calling this sub does
   # not clobber same-named globals (the main program has its own $SD), and
   # so that @SDs and @ncounts start out empty on every call: leftover
   # elements from an earlier call with a larger $num_recs would otherwise
   # be passed along in the flattened argument list of &composite_SD.
   local( $SD, $wt, $name1 );
   local( @SDs, @ncounts );
   local($summed_weights) = 0.0;

   # First, Gordie's way (gives SD & confidence interval for the avg MSL):
   #   sqrt( sum( w_i^2 * s_i^2 ) ), normalized by the sum of the weights
   $sum = 0.0;
   for ($i = 0; $i < $num_recs; $i++) {
      $SD = $rec_ci[$i] / 1.96;  # standard deviation
      $wt = $rec_weight[$i];
      $product = ($SD * $SD) * ($wt * $wt);
      $sum += $product;
      $summed_weights += $wt;
      if ($debugmode > 0) {
         # for each station, display the CI, SD, and product:
         # (the name is passed as a printf argument, not interpolated into
         # the format, so a '%' in a station name cannot garble the output)
         $name1 = &city_name_only( $rec_name[$i] );  # trim off country names, for conciseness
         printf "dbg: %d. %s, MSLt=%0.3f, CI=%0.3f, SD=%0.3f, Weight=%0.3f, SD^2 * Wt^2 = %0.5f\n",
                $i, $name1, $rec_trend[$i], $rec_ci[$i], $SD, $wt, $product;
      }
      # inputs for the second calculation, below:
      $SDs[$i] = $SD;
      $ncounts[$i] = 600 * $wt;  # 600 is arbitrary: 50 years x 12 months
   }#for
   $SD1 = sqrt($sum) / $summed_weights;
   $CI1 = 1.96 * $SD1;

   # Second, via ANOVA (gives SD & CI for LMSL, because it includes variation between stations):
   # NOTE(review): &composite_SD is defined in composite_sd.pl; this call
   # presumably passes the count followed by that many means, SDs, and
   # sample counts -- confirm against that file.
   $SD2 = &composite_SD( $num_recs, @rec_trend, @SDs, @ncounts );
   $CI2 = 1.96 * $SD2;

   if ($debugmode > 0) {
      printf "For distance-weighted avg MSL: SD=%7.4f, CI=%7.4f; ", $SD1, $CI1;
      printf "   for LMSL: SD=%7.4f, CI=%7.4f\n", $SD2, $CI2;
   }

   return ($SD1, $CI1, $SD2, $CI2);
}#dist_weighted_CI


print "Analyzing $num_recs records...\n";
printf "Median MSL trend        = %0.4f\n", &median( @rec_trend );
printf "Simple average (avg1)   = %0.4f\n", &avg( @rec_trend );
printf "Equal station-year avg2 = %0.4f\n", &duration_weighted_avg;

if ($opt_export) {
   $geoAvg = &dist_weighted_avg( 'geo_weights.csv' );
} else {
   $geoAvg = &dist_weighted_avg;
}
( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;

printf "Distance-weighted avg   = %0.4f +/- %0.4f  (SD=%0.4f", $geoAvg, $CI95pct, $SD;
if ($opt_lmsl) {
   printf ", LMSL SD=%0.2f", $SD2;
}
print ")   (for knee=$kneeDist km [\@ correlation weight 0.3333], and limit=$limitDist km)\n";
#
# Here's the result which that prints:
#    Distance-weighted avg   = 1.1330 +/- 0.0722  (SD=0.0368)   (for knee=400 km [@ correlation weight 0.3333], and limit=800 km)
#
# With the "-lmsl" option specified it prints:
#    Distance-weighted avg   = 1.1330 +/- 0.0722  (SD=0.0368, LMSL SD=2.03)   (for knee=400 km [@ correlation weight 0.3333], and limit=800 km)
#


# A further variant: weight each station by BOTH its geographical-distance
# weight and its years of operation (i.e., by the product of the two).
local($summed_weights, $summed_weighted_trends) = (0.0, 0.0);
local($combined_wt);
$i = 0;
while ($i < $num_recs) {
   $combined_wt = $rec_weight[$i] * $rec_years[$i];
   $summed_weights += $combined_wt;
   $summed_weighted_trends += $combined_wt * $rec_trend[$i];
   $i++;
}
# weighted mean = sum(weight * trend) / sum(weight)
$dbl_weighted_avg = $summed_weighted_trends / $summed_weights;
printf( "Avg weighted BOTH ways  = %0.4f\n", $dbl_weighted_avg );



# if you want debug prints for the rest, specify "-d" TWICE!
# (Debug output, e.g. in &dist_weighted_CI, is only printed while $debugmode > 0,
# so lowering the level by one here silences the remaining calculations unless
# the level was at least 2.)
$debugmode--;


# Sanity/debug check:
#
# If this was calculated properly, then adding duplicate entries for any tide station
# will not much affect the result.
#
# In fact, it can't affect the result at all for isolated stations, like Guam (MSL
# trend -1.05 mm/yr), and Takoradi (MSL trend +3.35 mm/yr), which are more than 800 km
# from any other tide stations.
#
# For non-isolated stations (i.e., stations which are within 800 km of other stations),
# adding duplicate entries should have only a very small effect on the result.
#
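# So, to sanity-check my code, I examined the effect of adding duplicate station
# records, with four different stations: Guam, Takoradi, Furuogrund, and Galveston.
# (The '-duptest' code below does this for Mumbai, if it is in the data set, and
# for whichever station is first in the list.)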
#
# As expected, the duplicate records do not affect the result at all for Guam or Takoradi,
# and affect it very little for Galveston and Furuogrund.  That gives confidence that
# the calculation was done correctly.
#

$true_num_recs = $num_recs;  # 159 (the number of GLOSS-LTT tide stations in NOAA's list) unless -first and/or -last were specified


# Helper for '-duptest':  append $dup_count scratch copies of record number
# $src_indx immediately after the real records (i.e., starting at index
# $true_num_recs), naming the copies "$label" . 1, 2, 3, ..., and set $num_recs
# so that it covers the real records plus all of the copies.
#
# Parameters:  $src_indx  - index of the record to copy
#              $dup_count - how many copies to append
#              $label     - name prefix for the copies
# Side effects: overwrites @rec_trend, @rec_ci, @rec_years, @rec_lat, @rec_lon
#               and @rec_name from index $true_num_recs onward; sets $num_recs.
sub append_duplicate_recs {
   local($src_indx, $dup_count, $label) = @_;
   local($k, $dest);
   for ($k = 0; $k < $dup_count; $k++) {
      $dest = $true_num_recs + $k;
      $rec_trend[$dest] = $rec_trend[$src_indx];
      $rec_ci   [$dest] = $rec_ci   [$src_indx];
      $rec_years[$dest] = $rec_years[$src_indx];
      $rec_lat  [$dest] = $rec_lat  [$src_indx];
      $rec_lon  [$dest] = $rec_lon  [$src_indx];
      $rec_name [$dest] = $label . ($k + 1);
   }
   $num_recs = $true_num_recs + $dup_count;
}


# If '-duptest' is specified, we test what happens when some records are duplicated
# (this is just a sanity test; you shouldn't ever really need to do it, since I've already done it).
if ($opt_duptest) {

   print "\n'-duptest' (diagnostic/sanity check):\n";
   # Duplicate the Mumbai tide station, because it is >800 km from any other station
   $Mumbai_indx = -1;
   for ($j=0; $j < $num_recs; $j++) {
      if ('Mumbai' eq substr($rec_name[$j],0,6)) {
         $Mumbai_indx = $j;
      }
   }
   if ($Mumbai_indx != -1) {
      $cityname = &city_name_only( $rec_name[$Mumbai_indx] ); # 'Mumbai'
      # Add 30 duplicate Mumbai tide stations to the data set.
      # (This used to add only 29, although the message below says 30.)
      &append_duplicate_recs( $Mumbai_indx, 30, $cityname );
      $geoAvg = &dist_weighted_avg;
      ( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;
      printf "Distance-weighted avg   = %0.4f +/- %0.4f  (debug check with 30 extra ${cityname}s)\n", $geoAvg, $CI95pct;
      # Now repeat with just the first of the duplicates:
      $num_recs = $true_num_recs + 1;
      $geoAvg = &dist_weighted_avg;
      ( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;
      printf "Distance-weighted avg   = %0.4f +/- %0.4f  (debug check with 1 extra $cityname)\n", $geoAvg, $CI95pct;
   }

   # Replace the duplicate stations with duplicate Reykjavik (if unsorted) or Furuogrund (if sorted) tide stations:
   $cityname = &city_name_only( $rec_name[0] );
   &append_duplicate_recs( 0, 30, $cityname );
   $geoAvg = &dist_weighted_avg;
   ( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;
   printf "Distance-weighted avg   = %0.4f +/- %0.4f  (debug check with 30 extra ${cityname}s)\n", $geoAvg, $CI95pct;
   # Now repeat with just the first of the duplicates:
   $num_recs = $true_num_recs + 1;
   $geoAvg = &dist_weighted_avg;
   ( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;
   printf "Distance-weighted avg   = %0.4f +/- %0.4f  (debug check with 1 extra $cityname)\n", $geoAvg, $CI95pct;

   # Put the data set back the way we found it.  Resetting $num_recs alone is
   # not enough, because some code passes the record arrays whole (for example
   # "@rec_trend" as a subroutine argument), so the scratch duplicates must be
   # removed from the arrays, too.
   $num_recs = $true_num_recs;
   $#rec_trend = $true_num_recs - 1;
   $#rec_ci    = $true_num_recs - 1;
   $#rec_years = $true_num_recs - 1;
   $#rec_lat   = $true_num_recs - 1;
   $#rec_lon   = $true_num_recs - 1;
   $#rec_name  = $true_num_recs - 1;
}


# But what if my approximation function for the observed MSL Trend difference
# vs. distance wasn't quite right?  How much difference could that make?
#
# To answer that question, if '-alldist' is specified then we'll repeat the
# calculations at 48 different "knee" distances, alternating between
# limit = 2x knee and limit = 3x knee.  For simplicity, I again just use a
# three-segment piecewise-linear approximating function, but vary the
# "knee" & "limit" points over a wide range.
#
# Our best estimate of the correct knee & limit are 400 & 800 km, respectively.
# For the semi-reasonable range of knee = 250..600, test at smaller intervals.
# Outside that range, test at larger intervals.
#
if ($opt_alldist) {
   print "\n";
   print "--------------------------------------------------------------\n";

   # Sweep the knee distance from 0 up to 10000 km.  Each knee value is used
   # with limit = 2x knee and then limit = 3x knee; the very first pass
   # (knee = 0) is made just once, with the multiplier at 3.
   local($kneeStep);
   $mult = 3;
   $kneeDist = 0;
   while ($kneeDist <= 10000) {
      $limitDist = $mult * $kneeDist;

      # global avg MSL trend (weighted by geographical distances), followed by
      # the two kinds of standard deviations and confidence intervals
      $geoAvg = &dist_weighted_avg;
      ( $SD, $CI95pct, $SD2, $CI95pct2 ) = &dist_weighted_CI;

      printf( "Distance-weighted avg   = %0.4f +/- %0.4f CI=[%0.4f..%0.4f] (SD=%0.4f) ",
              $geoAvg, $CI95pct, $geoAvg - $CI95pct, $geoAvg + $CI95pct, $SD );
      printf( " (LMSL SD=%0.2f CI=[%0.2f..%0.2f]) ",
              $SD2, $geoAvg - $CI95pct2, $geoAvg + $CI95pct2 ) if $opt_lmsl;
      print " for knee=$kneeDist \& limit=$limitDist km\n";

      # After a 2x pass, stay at the same knee and do the 3x pass next.
      if (2 == $mult) {
         $mult = 3;
         next;
      }

      # After a 3x pass, go back to 2x and move on to the next knee distance.
      # The step is finest (25 km) in the 250..600 km range, and coarser elsewhere.
      $mult = 2;
      $kneeStep = ($kneeDist >= 3000) ? 500
                : ($kneeDist >= 2000) ? 250
                : ($kneeDist >= 1000) ? 200
                : ($kneeDist >=  700) ? 100
                : ($kneeDist >=  600) ?  50
                : ($kneeDist >=  250) ?  25
                :                        50;
      $kneeDist += $kneeStep;
   }#while
}


# Normal completion: end the program with exit status 0.
exit 0;

__END__


