#!/usr/bin/perl

# Subroutine to calculate the coefficients of a least-squares best-fit
# line to a list of X,Y points.
#
# (Written for Perl 5, but compatible with Perl 4 if you delete the "use bignum"
# and "no bignum" directives.)
#
# Copyright 2010, by David A. Burton, 2010
# Cary, NC  USA
# +1-919-481-0149
# Email: http://www.burtonsys.com/email/
#
# You can use a "require" to pull this into another Perl program, like this:
#
#   unshift( @INC, '.' );
#   require "linear_fit.pl";

# TLIB Version Control fills in the version information for us.
# The "keyflag" comment below is the template; the assignment that follows it
# is the expanded result, so both lines must be left exactly as they are.
$version_str = "";
#--=>keyflag<=-- "&(#)%n, version %v, %d "
$version_str = "&(#)linear_fit.pl, version 7, 27-May-10 ";


$| = 1;   # Predefined variable.  If non-zero, each print to the console
          # is displayed immediately instead of being buffered.


# Load the helper from the current folder: temporarily put '.' at the front
# of @INC, require the file, then remove the entry that was added.
# NOTE(review): presumably this file supplies &invoked_via_do_or_require,
# which is called near the end of this file -- confirm.
unshift( @INC, '.' );  # make sure we fetch it from the current folder
require "detect_do_or_require.pl";
shift( @INC );  # restore @INC (drops the '.' added above)


# "use bignum" for sub linear_fit: the pragma stays in effect until the
# matching "no bignum" that follows the sub, so the numeric constants inside
# linear_fit are handled by bignum.
use bignum;  # See http://perldoc.perl.org/bignum.html

# Load twounknowns.pl from the current folder, using the same
# unshift / require / shift pattern as for the other helper file.
# NOTE(review): presumably this file defines &two_unknowns, which
# linear_fit calls -- confirm.
unshift( @INC, '.' );  # make sure we fetch this module from the current folder
require "twounknowns.pl";
shift( @INC );  # restore @INC (drops the '.' added above)


# Least-squares fit of a line ("linear regression").
#
# Input:   a flat list of X,Y pairs (X1,Y1, X2,Y2, ...) giving the coordinates
#          of the points to be fit.  The points must contain at least two
#          distinct X values, otherwise no unique best-fit line exists.
#          (With exactly two points the "fit" is simply the line through
#          them; three or more are needed for a real regression.)
# Returns: a 3-element list, consisting of the slope M, the y-intercept B,
#          and the correlation coefficient R.
# Dies:    if an odd number of values is passed, or if the X values do not
#          determine a line (no points, a single point, or all X identical).
#
# If the global $debugmode is true, the pair count and the results are printed.
#
# Gratefully adapted from  http://www.pgccphy.net/Linreg/linreg_pl.txt
# by Dr. David G. Simpson, Department of Physical Science,
# Prince George's Community College, Largo, Maryland  20774
sub linear_fit {
   # "local" (rather than "my") is kept on purpose: the file header promises
   # Perl 4 compatibility once the bignum directives are deleted.
   local( $x, $y, $cntr, $sumx, $sumx2, $sumxy, $sumy, $sumy2, $m, $b, $r, $denom );
   local( $n ) = int((1+$#_) / 2);
   if (((2*$n)-1) != $#_) {
      die "ERR: odd number of values passed to sub linear_fit\n";
   }
   if ($debugmode) {
      print "linear_fit( $n XY pairs ):\n";
   }

   # Accumulate the sums needed by the normal equations, consuming @_
   # two values (one X,Y pair) at a time.
   $sumx = $sumx2 = $sumxy = $sumy = $sumy2 = $cntr = 0;
   while ($#_ > 0) {
      $cntr++;
      $x = shift @_;
      $y = shift @_;
      $sumx  += $x;       # compute sum of x
      $sumx2 += $x * $x;  # compute sum of x**2
      $sumxy += $x * $y;  # compute sum of x * y
      $sumy  += $y;       # compute sum of y
      $sumy2 += $y * $y;  # compute sum of y**2
   }
   if ($n != $cntr) {
      die "ERR: linear_fit, n != cntr\n";
   }

   # Shared denominator of the slope and intercept formulas.  It is zero when
   # there are no points, only one point, or every X is the same; in those
   # cases no unique line exists, so fail with a clear message instead of
   # dividing by zero.
   $denom = $n * $sumx2 - ($sumx * $sumx);
   if ($denom == 0) {
      die "ERR: linear_fit, X values do not determine a line (need at least two distinct X values)\n";
   }

   $m = ($n * $sumxy  -  $sumx * $sumy) / $denom;      # compute slope
   $b = ($sumy*$sumx2 - $sumx*$sumxy) / $denom;        # compute y-intercept

   # NOTE(review): if every Y is identical, the square root below is zero.
   # $m and $b are still valid then, but the correlation is undefined; under
   # bignum the division presumably yields NaN rather than dying -- confirm
   # before relying on $r for perfectly flat data.
   $r = ($sumxy - $sumx * $sumy / $n) /                       # compute correlation coefficient
            sqrt(($sumx2 - ($sumx * $sumx)/$n) * ($sumy2 - ($sumy * $sumy)/$n));
   if ($debugmode) {
      printf "  Slope        m = %13.6f\n", $m;
      printf "  y-intercept  b = %13.6f\n", $b;
      printf "  Correlation  r = %13.6f\n", $r;
   }

   # Cross-check: solve the same two normal equations with &two_unknowns.
   # The results are used only by the (disabled) debug line below.
   local($m2,$b2) = &two_unknowns( $sumx2,$sumx,$sumxy, $sumx,$cntr,$sumy );
   # print "dbg: (sub linear_fit) m=$m=$m2  b=$b=$b2\n";

   return ($m, $b, $r);
}

# End of the "use bignum" region that was opened for sub linear_fit.
no bignum;


# If invoked from the command line, print an error message and exit with a
# non-zero status.
# NOTE(review): &invoked_via_do_or_require is not defined in this file;
# presumably it comes from detect_do_or_require.pl and returns true when
# this file was loaded via "do" or "require" -- confirm.
if (! &invoked_via_do_or_require) {
   print "$0 is intended to be loaded via 'require'.  See comments\n" .
         "in the source code file for instructions.\n";
   exit 1;
}


# A file loaded with "require" must end by returning a true value.
1;

__END__





Given a scatter plot of X and Y coordinates, how does one find the
best-fitting line?

Let's say that Y = m*X + b is the best fit line.

  m=slope    b=Y-intercept

How does one find m and b for the best fit line?


- - - - - - - - - - - - - - - - - - - - - - - -

Suppose the points you are given are {(X[i],Y[i]): 1 <= i <= N}.
Then you want the values of m and b that minimize the sum of
squares of the deviations of Y[i] from the line, m*X[i] + b.
They will give you the best-fitting linear equation.

Let the sum of the squares of the deviations be

             N
   F(m,b) = SUM (m*X[i] + b - Y[i])^2.
            i=1

To minimize this, take partial derivatives of F with respect to
the two variables, m and b, set both equal to zero, and solve
simultaneously:

   dF/dm = SUM 2*(m*X[i] + b - Y[i])*X[i] = 0,
   dF/db = SUM 2*(m*X[i] + b - Y[i]) = 0.

(Here all sums range over i = 1, 2, ..., N.)  Dividing by 2 and
rearranging, you can see that these are two simultaneous linear
equations in the two unknowns m and b:

   (SUM X[i]^2)*m + (SUM X[i])*b = SUM X[i]*Y[i],
   (SUM X[i])*m  +  (SUM 1)*b    = SUM Y[i].


So let's solve the 2 equations.

First, note that (SUM 1) = N

For simplicity of notation, let:

   e=(SUM X[i]^2)
   f=(SUM X[i])
   g=(SUM X[i]*Y[i])
   h=(SUM Y[i])
   N=(SUM 1)

Then the 2 equations are in standard form (except that the unknowns m & b,
which we are solving for, are renamed x & y below):

   e*x + f*y = g
   f*x + N*y = h

   (x,y) = &two_unknowns( e,f,g, f,N,h );

And here's the Perl code:

   $e = $f = $g = $h = $N = 0;
   while ($#_ > 0) {
      $N++;
      $x = shift @_;
      $y = shift @_;
      $f += $x;          # a/k/a $sumx
      $e += ($x * $x);   # a/k/a $sumx2
      $h += $y;          # a/k/a $sumy
      $g += ($x * $y);   # a/k/a $sumxy
   }
   ($x,$y) = &two_unknowns( $e,$f,$g, $f,$N,$h );

Or, reverting to the m,b notation above:

   ($m,$b) = &two_unknowns( $e,$f,$g, $f,$N,$h );
   ($m,$b) = &two_unknowns( $sumx2,$sumx,$sumxy, $sumx,$N,$sumy );

