/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.regression;

import java.io.Serializable;

import org.apache.commons.math3.distribution.TDistribution;
import org.apache.commons.math3.exception.MathIllegalArgumentException;
import org.apache.commons.math3.exception.NoDataException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x  </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul></p>
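 * <p>
 * A minimal usage sketch (illustrative values; the model needs at least two
 * observations with distinct x values before the statistics are defined):</p>
 * <pre>{@code
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 5d);
 * double slope = regression.getSlope();        // recomputed from the current sums on each call
 * double intercept = regression.getIntercept();
 * double prediction = regression.predict(6d);  // intercept + slope * 6
 * }</pre>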
 *
 * @version $Id: SimpleRegression.java 1519851 2013-09-03 21:16:35Z tn $
 */
public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** sum of x values */
    private double sumX = 0d;

    /** total variation in x (sum of squared deviations from xbar) */
    private double sumXX = 0d;

    /** sum of y values */
    private double sumY = 0d;

    /** total variation in y (sum of squared deviations from ybar) */
    private double sumYY = 0d;

    /** sum of products */
    private double sumXY = 0d;

    /** number of observations */
    private long n = 0;

    /** mean of accumulated x values, used in updating formulas */
    private double xbar = 0;

    /** mean of accumulated y values, used in updating formulas */
    private double ybar = 0;

    /** include an intercept or not */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance
     */
    public SimpleRegression() {
        this(true);
    }
    /**
    * Create a SimpleRegression instance, specifying whether or not to estimate
    * an intercept.
    *
    * <p>Use {@code false} to estimate a model with no intercept.  When the
    * {@code hasIntercept} property is false, the model is estimated without a
    * constant term and {@link #getIntercept()} returns {@code 0}.</p>
    *
    * @param includeIntercept whether or not to include an intercept term in
    * the regression model
    */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
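     * <p>A short incremental-use sketch (values are illustrative):</p>
     * <pre>{@code
     * SimpleRegression regression = new SimpleRegression();
     * regression.addData(1d, 1d);
     * double earlySlope = regression.getSlope();   // NaN: fewer than two observations
     * regression.addData(2d, 3d);
     * double slope = regression.getSlope();        // now estimable
     * }</pre>
     *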
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if( hasIntercept ){
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if( !hasIntercept ){
            sumXX += x * x ;
            sumYY += y * y ;
            sumXY += x * y ;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
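     *
     * <p>A sketch of merging two partial fits, for example built on separate
     * data partitions (values are illustrative):</p>
     * <pre>{@code
     * SimpleRegression left = new SimpleRegression();
     * left.addData(new double[][]{{1, 2}, {2, 3}});
     * SimpleRegression right = new SimpleRegression();
     * right.addData(new double[][]{{3, 5}, {4, 6}});
     * left.append(right);              // left now reflects all four observations
     * double slope = left.getSlope();
     * }</pre>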
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            }else{
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller
     * is responsible for maintaining the set of observations in the window.</p>
     * <p>
     * The method has no effect if there are no data points in the model
     * (i.e. n = 0).</p>
     *
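     * <p>A sliding-window sketch; the caller keeps the window contents
     * ({@code window} and {@code windowSize} are hypothetical caller-side state):</p>
     * <pre>{@code
     * // on each new observation (x, y):
     * window.addLast(new double[] {x, y});
     * regression.addData(x, y);
     * if (window.size() > windowSize) {
     *     double[] oldest = window.removeFirst();
     *     regression.removeData(oldest[0], oldest[1]);
     * }
     * }</pre>
     *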
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
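     * <p>For example (illustrative values):</p>
     * <pre>{@code
     * SimpleRegression regression = new SimpleRegression();
     * double[][] data = { {1, 3}, {2, 5}, {3, 7} };   // each row is an {x, y} pair
     * regression.addData(data);   // appended to any observations already in the model
     * }</pre>
     *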
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is
     * less than 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if( data[i].length < 2 ){
               throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    public void addObservation(final double[] x, final double y)
    throws ModelSpecificationException {
        if( x == null || x.length == 0 ){
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, x != null ? x.length : 0, 1);
        }
        addData( x[0], y );
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable; the length
     * of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if ((x == null) || (y == null) || (x.length != y.length)) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for( int i = 0 ; i < x.length; i++){
            if( x[i] == null || x[i].length == 0 ){
                obsOk = false;
            }
        }
        if( !obsOk ){
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for( int i = 0 ; i < x.length ; i++){
            addData( x[i][0], y[i] );
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
    * Returns the slope of the estimated regression line.
    * <p>
    * The least squares estimate of the slope is computed using the
    * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
    * The slope is sometimes denoted b1.</p>
    * <p>
    * <strong>Preconditions</strong>: <ul>
    * <li>At least two observations (with at least two different x values)
    * must have been added before invoking this method. If this method is
    * invoked before a model can be estimated, <code>Double.NaN</code> is
    * returned.
    * </li></ul></p>
    *
    * @return the slope of the regression line
    */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return the mean square error associated with the regression model
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = FastMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>. Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if( !hasIntercept ){
            return Double.NaN;
        }
        return FastMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return FastMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong>Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li><code>(0 &lt; alpha &lt; 1)</code>; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul></p>
     *
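     * <p>For example, a 99% interval (illustrative):</p>
     * <pre>{@code
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01);
     * double lower = regression.getSlope() - halfWidth;
     * double upper = regression.getSlope() + halfWidth;
     * }</pre>
     *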
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the
     * slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
    throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equivalently, the correlation).
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted <code>Prob(|t| > 0)</code>.
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
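     * <p>For example, a simple two-sided test at the 5% level (illustrative):</p>
     * <pre>{@code
     * if (regression.getSignificance() < 0.05) {
     *     // the slope is significantly different from zero at the 95% level
     * }
     * }</pre>
     *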
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math3.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return 2d * (1.0 - distribution.cumulativeProbability(
                    FastMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
    * Returns the intercept of the estimated regression line, given the slope.
    * <p>
    * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
    *
    * @param slope current slope
    * @return the intercept of the regression line
    */
    private double getIntercept(final double slope) {
      if( hasIntercept){
        return (sumY - slope * sumX) / n;
      }
      return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
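     * <p>A usage sketch (the RegressionResults accessors shown are assumed
     * from that class; values are illustrative):</p>
     * <pre>{@code
     * RegressionResults results = regression.regress();
     * double intercept = results.getParameterEstimate(0);   // same as getIntercept()
     * double slope = results.getParameterEstimate(1);       // same as getSlope()
     * double rSquared = results.getRSquared();
     * }</pre>
     *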
     * @return a RegressionResults object holding the regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
          if( n < 3 ){
              throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
          }
          if( FastMath.abs( sumXX ) > Precision.SAFE_MIN ){
              final double[] params = new double[]{ getIntercept(), getSlope() };
              final double mse = getMeanSquareError();
              final double _syy = sumYY + sumY * sumY / n;
              final double[] vcv = new double[]{
                mse * (xbar * xbar / sumXX + 1.0 / n),
                -xbar * mse / sumXX,
                mse / sumXX };
              return new RegressionResults(
                      params, new double[][]{vcv}, true, n, 2,
                      sumY, _syy, getSumSquaredErrors(), true, false);
          }else{
              final double[] params = new double[]{ sumY / n, Double.NaN };
              //final double mse = getMeanSquareError();
              final double[] vcv = new double[]{
                ybar / (n - 1.0),
                Double.NaN,
                Double.NaN };
              return new RegressionResults(
                      params, new double[][]{vcv}, true, n, 1,
                      sumY, sumYY, getSumSquaredErrors(), true, false);
          }
        }else{
          if (n < 2) {
              throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
          }
          if( !Double.isNaN(sumXX) ){
              final double[] vcv = new double[]{ getMeanSquareError() / sumXX };
              final double[] params = new double[]{ sumXY / sumXX };
              return new RegressionResults(
                          params, new double[][]{vcv}, true, n, 1,
                          sumY, sumYY, getSumSquaredErrors(), false, false);
          }else{
              final double[] vcv = new double[]{ Double.NaN };
              final double[] params = new double[]{ Double.NaN };
              return new RegressionResults(
                          params, new double[][]{vcv}, true, n, 1,
                          Double.NaN, Double.NaN, Double.NaN, false, false);
          }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object.
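     *
     * <p>For example, with an intercept model (illustrative):</p>
     * <pre>{@code
     * RegressionResults full = regression.regress(new int[]{0, 1});   // intercept and slope
     * RegressionResults meanOnly = regression.regress(new int[]{0});  // intercept (mean) only
     * RegressionResults slopeOnly = regression.regress(new int[]{1}); // slope only, no intercept
     * }</pre>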
     * @param variablesToInclude an array of indices of regressors to include
     * @return a RegressionResults object holding the regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if( variablesToInclude == null || variablesToInclude.length == 0){
          throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if( hasIntercept ){
            if( variablesToInclude.length == 2 ){
                if( variablesToInclude[0] == 1 ){
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                }else if( variablesToInclude[0] != 0 ){
                    throw new OutOfRangeException( variablesToInclude[0], 0, 1 );
                }
                if( variablesToInclude[1] != 1){
                     throw new OutOfRangeException( variablesToInclude[1], 0, 1 );
                }
                return regress();
            }else{
                if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
                     throw new OutOfRangeException( variablesToInclude[0], 0, 1 );
                }
                final double _mean = sumY * sumY / n;
                final double _syy = sumYY + _mean;
                if( variablesToInclude[0] == 0 ){
                    //just the mean
                    final double[] vcv = new double[]{ sumYY / ((n - 1) * n) };
                    final double[] params = new double[]{ ybar };
                    return new RegressionResults(
                      params, new double[][]{vcv}, true, n, 1,
                      sumY, _syy + _mean, sumYY, true, false);

                }else if( variablesToInclude[0] == 1){
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double _sxx = sumXX + sumX * sumX / n;
                    final double _sxy = sumXY + sumX * sumY / n;
                    final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
                    final double _mse = _sse / (n - 1);
                    if( !Double.isNaN(_sxx) ){
                        final double[] vcv = new double[]{ _mse / _sxx };
                        final double[] params = new double[]{ _sxy / _sxx };
                        return new RegressionResults(
                                    params, new double[][]{vcv}, true, n, 1,
                                    sumY, _syy, _sse, false, false);
                    }else{
                        final double[] vcv = new double[]{ Double.NaN };
                        final double[] params = new double[]{ Double.NaN };
                        return new RegressionResults(
                                    params, new double[][]{vcv}, true, n, 1,
                                    Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        }else{
            if( variablesToInclude[0] != 0 ){
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}