#!/usr/bin/perl

# Subroutine to calculate the coefficients of a least-squares best-fit
# line to a list of X,Y points and weights.
#
# (Written for Perl 5, but compatible with Perl 4 if you delete the "use bignum"
# and "no bignum" directives.)
#
# Copyright 2010, by David A. Burton, 2010
# Cary, NC  USA
# +1-919-481-0149
# Email: http://www.burtonsys.com/email/
#
# You can use a "require" to pull this into another Perl program, like this:
#
#   unshift( @INC, '.' );
#   require "weighted_linear_fit.pl";

# TLIB Version Control fills in the version information for us:
$version_str = "";
#--=>keyflag<=-- "&(#)%n, version %v, %d "
$version_str = "&(#)weighted_linear_fit.pl, version 5, 28-May-10 ";


$| = 1;   # predefined variable. If <> 0 then each print to the console
          # will immediatly be displayed, instead of buffered.


unshift( @INC, '.' );  # make sure we fetch it from the current folder
require "detect_do_or_require.pl";
shift( @INC );  # restore


# "use bignum" for sub linear_fit
use bignum;  # See http://perldoc.perl.org/bignum.html

unshift( @INC, '.' );  # make sure we fetch this module from the current folder
require "twounknowns.pl";
shift( @INC );  # restore


# Least-squares fit of a line ("linear regression") to a set of weighted points.
# Input is three arrays of identical length: @x (the X-coordinates), @y (the
# Y-coordinates), and @w (the weights).
# Returns a 2-element list, consisting of the slope M, and the y-intercept B.
sub weighted_linear_fit {
   local( @x, @y, @w, $x, $y, $w, $N, $sumxw, $sumx2w, $sumxyw, $sumyw, $sumy2, $m, $b, $r, $i );
   $N = 1+$#_;
   if ($N % 3) {
      die "ERR: number of values passed to sub weighted_linear_fit not divisible by 3\n";
   }
   $N = int(0.001+($N/3));
   if ($debugmode) {
      print "weighted_linear_fit( $N X,Y,W triples ):\n";
   }
   @x = splice( @_, 0, $N );
   @y = splice( @_, 0, $N );
   @w = @_;
   $sumxw = $sumx2w = $sumxyw = $sumyw = $sumw = 0;
  # $sumy2 = 0;
   for ($i=0; $i < $N; $i++) {
      $x = $x[$i];
      $y = $y[$i];
      $w = $w[$i];
      $sumxw  += $x * $w;       # compute sum of x * w
      $sumx2w += $x * $x * $w;  # compute sum of x**2 * w
      $sumxyw += $x * $y * $w;  # compute sum of x * y * w
      $sumyw  += $y * $w;       # compute sum of y * w
      $sumw   += $w;            # compute sum of weights
  #    $sumy2 += $y * $y;  # compute sum of y**2
   }

   $m = ($sumw * $sumxyw  -  $sumxw * $sumyw) / ($sumw * $sumx2w - ($sumxw * $sumxw));      # compute slope
   $b = ($sumyw*$sumx2w - $sumxw*$sumxyw) / ($sumw*$sumx2w - ($sumxw*$sumxw));      # compute y-intercept
  # $r = ($sumxyw - $sumxw * $sumyw / $sumw) /                       # compute correlation coefficient
  #          sqrt(($sumx2w - ($sumxw * $sumxw)/$sumw) * ($sumy2 - ($sumyw * $sumyw)/$sumw));
   if ($debugmode) {
      printf "  Slope        m = %13.6f\n", $m;
      printf "  y-intercept  b = %13.6f\n", $b;
  #    printf "  Correlation  r = %13.6f\n", $r;
   }

   local($m2,$b2) = &two_unknowns( $sumx2w,$sumxw,$sumxyw, $sumxw,$sumw,$sumyw );
   # print "dbg: (sub weighted_linear_fit) m=$m=$m2  b=$b=$b2\n";

   return ($m, $b);
}

no bignum;


# if invoked from command line, print an error message
if (! &invoked_via_do_or_require) {
   print "$0 is intended to be loaded via 'require'.  See comments\n" .
         "in the source code file for instructions.\n";
   exit 1;
}


1;

__END__





Given a scatter plot of X and Y cordinates, with weights for
each one, how does one find the best-fitting line.

Let's say that Y = m*X + b is the best fit line.

  m=slope    b=Y-intercept

How does one find m and b for the best fit line?


- - - - - - - - - - - - - - - - - - - - - - - -

Suppose the points you are given are {(X[i],Y[i]): 1 <= i <= N}.
Also suppose the weights are W[i].

Then you want the values of m and b that minimize the weighted sum
squares of the deviations of Y[i] from the line, m*X[i] + b.
They will give you the best-fitting linear equation.

Let the sum of the squares of the deviations be

             N
   F(m,b) = SUM (W[i] * (m*X[i] + b - Y[i])^2)
            i=1

To minimize this, take partial derivatives of F with respect to
the two variables, m and b, set both equal to zero, and solve
simultaneously.

The derivative of a sum is the sum of the derivatives of the
summed terms, and the derivative of a product is, by the
"product rule," as follows:

   d/dx F(x)G(x) = F(x)G'(x) * F'(x)G(x)

The derivative of a constant like W[i] is zero, so when we take
the partial derivatives of F(m,b) w/r/t m and b and set them to
zero we have:

   dF/dm = SUM 2*(m*X[i] + b - Y[i])*X[i]*W[i] = 0
   dF/dC = SUM 2*(m*X[i] + b - Y[i])*W[i] = 0

(Here all sums range over i = 1, 2, ..., N.)  Dividing by 2 and
rearranging, you can see that these are two simultaneous linear
equations in the two unknowns m and b:

   (SUM X[i]^2*W[i])*m + (SUM X[i]*W[i])*b = SUM X[i]*Y[i]*W[i],
   (SUM X[i]*W[i])*m  +  (SUM W[i])*b    = SUM Y[i]*W[i].


So Let's solve the 2 equations.

For simplicity of notation, let:

   e=(SUM X[i]^2*W[i])
   f=(SUM X[i]*W[i])
   g=(SUM X[i]*Y[i]*W[i])
   h=(SUM Y[i]*W[i])
   k=(SUM W[i])

Then the 2 equations are in standard form (except that the m & b which
we're solving for, above, I'm going to now call x & y):

   e*x + f*y = g
   f*x + k*y = h

   (x,y) = &two_unknowns( e,f,g, f,k,h );

And here's the Perl code:

   $e = $f = $g = $h = $k = $N = 0;
   $N = 1 + $#x;
   for ($i = 0; $i <= $#x; $i++) {
      $N++;
      $x = $x[$i];
      $y = $y[$i];
      $w = $w[$i];
      $f += $x * $w;       # a/k/a $sumxw
      $e += $x * $x * $w;  # a/k/a $sumx2w
      $h += $y * $w;       # a/k/a $sumyw
      $g += $x * $y * $w;  # a/k/a $sumxyw
      $k += $w;            # a/k/a $sumw
   }
   ($x,$y) = &two_unknowns( $e,$f,$g, $f,$k,$h );

Or, reverting to the m,b notation, above

   ($m,$b) = &two_unknowns( $e,$f,$g, $f,$k,$h );
   ($m,$b) = &two_unknowns( $sumx2w,$sumxw,$sumxyw, $sumxw,$sumw,$sumyw );

