/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.regression;

import java.io.Serializable;

import org.apache.commons.math3.distribution.TDistribution;
import org.apache.commons.math3.exception.MathIllegalArgumentException;
import org.apache.commons.math3.exception.NoDataException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same)
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul></p>
 *
 */
public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** sum of x values */
    private double sumX = 0d;

    /** total variation in x (sum of squared deviations from xbar) */
    private double sumXX = 0d;

    /** sum of y values */
    private double sumY = 0d;

    /** total variation in y (sum of squared deviations from ybar) */
    private double sumYY = 0d;

    /** sum of products */
    private double sumXY = 0d;

    /** number of observations */
    private long n = 0;

    /** mean of accumulated x values, used in updating formulas */
    private double xbar = 0;

    /** mean of accumulated y values, used in updating formulas */
    private double ybar = 0;

    /** include an intercept or not */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }
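    /*
     * Illustrative usage sketch (not part of this class; the data values are made up).
     * It shows the basic add-then-query workflow described in the class javadoc:
     *
     *   SimpleRegression regression = new SimpleRegression();        // intercept estimated
     *   regression.addData(1d, 2d);
     *   regression.addData(3d, 3d);
     *   regression.addData(4d, 6d);
     *   double slope = regression.getSlope();
     *   double intercept = regression.getIntercept();
     *   double yHat = regression.predict(5d);                        // intercept + slope * 5
     *
     *   // Suppressing the intercept forces the fitted line through the origin:
     *   SimpleRegression noIntercept = new SimpleRegression(false);  // getIntercept() returns 0
     */
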
    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

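    /*
     * Illustrative sketch (hypothetical values): because each getter recomputes its
     * statistic from the running sums maintained by addData, statistics can be read
     * at any point and read again after more data arrives, without a new instance:
     *
     *   SimpleRegression regression = new SimpleRegression();
     *   regression.addData(1d, 1.1d);
     *   regression.addData(2d, 1.9d);
     *   double earlySlope = regression.getSlope();   // based on the first 2 observations
     *   regression.addData(3d, 3.2d);
     *   double laterSlope = regression.getSlope();   // recomputed over all 3 observations
     */
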
    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * Pébay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations, however the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no points of data (i.e. n=0)
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

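    /*
     * Illustrative sketch (hypothetical partitioning): append() lets two models built
     * over disjoint portions of a data set be merged into a single model, e.g. when
     * the portions are accumulated independently:
     *
     *   SimpleRegression first = new SimpleRegression();
     *   first.addData(new double[][] { {1d, 2d}, {2d, 3d} });
     *
     *   SimpleRegression second = new SimpleRegression();
     *   second.addData(new double[][] { {3d, 5d}, {4d, 6d} });
     *
     *   first.append(second);          // first now reflects all four observations
     *   double slope = first.getSlope();
     */
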
    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    public void addObservation(final double[] x, final double y)
        throws ModelSpecificationException {
        if (x == null || x.length == 0) {
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable
     * The length of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if ((x == null) || (y == null) || (x.length != y.length)) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
            }
        }
        if (!obsOk) {
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations, however the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    public long getN() {
        return n;
    }

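    /*
     * Illustrative sketch (hypothetical stream): removeData mirrors addData, so the
     * caller can maintain a fixed-size sliding window as the javadoc above describes.
     * Deque/ArrayDeque and WINDOW_SIZE are illustrative choices, not part of this class:
     *
     *   SimpleRegression regression = new SimpleRegression();
     *   Deque<double[]> window = new ArrayDeque<double[]>();   // caller-managed window
     *   // for each incoming observation (x, y):
     *   //   window.addLast(new double[] {x, y});
     *   //   regression.addData(x, y);
     *   //   if (window.size() > WINDOW_SIZE) {
     *   //       double[] old = window.removeFirst();
     *   //       regression.removeData(old[0], old[1]);
     *   //   }
     */
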
    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

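    /*
     * Illustrative sketch of the estimates the getters above return, expressed in
     * terms of the running sums this class maintains (notation only, no extra API):
     *
     *   slope      = sumXY / sumXX                      // Sxy / Sxx, see getSlope()
     *   intercept  = (sumY - slope * sumX) / n          // ybar - slope * xbar, see getIntercept(slope)
     *   predict(x) = intercept + slope * x
     *
     * Until two observations with distinct x values have been added, both estimates
     * are Double.NaN, so predict(x) is also NaN.
     */
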
    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return sum of squared errors divided by degrees of freedom (MSE)
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = FastMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

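    /*
     * Illustrative sketch of how the ANOVA-style getters above relate to one another
     * (identities implied by the formulas in their javadoc, not additional API):
     *
     *   SSE  = getSumSquaredErrors()        // SYY - SXY^2 / SXX
     *   SSTO = getTotalSumSquares()         // squared deviations of y about ybar
     *   SSR  = getRegressionSumSquares()    // slope^2 * SXX
     *
     *   // With an intercept in the model and apart from rounding, SSTO = SSR + SSE,
     *   // getRSquare() = (SSTO - SSE) / SSTO, and getR() carries the sign of the slope.
     */
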
    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.  Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return FastMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return FastMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

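    /*
     * Illustrative sketch (hypothetical fitted model named "regression"): reporting
     * the slope together with the 95% interval produced by the method above:
     *
     *   double slope = regression.getSlope();
     *   double halfWidth = regression.getSlopeConfidenceInterval();   // alpha = 0.05
     *   double lower = slope - halfWidth;
     *   double upper = slope + halfWidth;
     *   // Returns NaN until the model has at least three observations with varying x.
     */
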
    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is </p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong> Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li><code>(0 &lt; alpha &lt; 1)</code>; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul></p>
     *
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
        throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equivalently, the correlation).
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math3.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return 2d * (1.0 - distribution.cumulativeProbability(
                    FastMath.abs(getSlope()) / getSlopeStdErr()));
    }

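    /*
     * Illustrative sketch (hypothetical threshold): getSignificance() is the two-sided
     * p-value of the t statistic slope / s(b1) on n - 2 degrees of freedom, so a
     * conventional test of "slope differs from 0" can be written as:
     *
     *   double pValue = regression.getSignificance();
     *   boolean slopeIsSignificant = pValue < 0.05;   // 0.05 is an arbitrary example level
     */
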
    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double _syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

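    /*
     * Illustrative sketch: regress() packages the current estimates into a
     * RegressionResults container. The accessor names below (getParameterEstimate,
     * getRSquared) are assumed from that class; when an intercept is estimated,
     * parameter 0 is the intercept and parameter 1 is the slope:
     *
     *   RegressionResults results = regression.regress();
     *   double intercept = results.getParameterEstimate(0);
     *   double slope = results.getParameterEstimate(1);
     *   double rSquared = results.getRSquared();
     */
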
    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new OutOfRangeException(variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                final double _mean = sumY * sumY / n;
                final double _syy = sumYY + _mean;
                if (variablesToInclude[0] == 0) {
                    //just the mean
                    final double[] vcv = new double[] { sumYY / ((n - 1) * n) };
                    final double[] params = new double[] { ybar };
                    return new RegressionResults(
                            params, new double[][] { vcv }, true, n, 1,
                            sumY, _syy + _mean, sumYY, true, false);

                } else if (variablesToInclude[0] == 1) {
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double _sxx = sumXX + sumX * sumX / n;
                    final double _sxy = sumXY + sumX * sumY / n;
                    final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
                    final double _mse = _sse / (n - 1);
                    if (!Double.isNaN(_sxx)) {
                        final double[] vcv = new double[] { _mse / _sxx };
                        final double[] params = new double[] { _sxy / _sxx };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                sumY, _syy, _sse, false, false);
                    } else {
                        final double[] vcv = new double[] { Double.NaN };
                        final double[] params = new double[] { Double.NaN };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}