/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.stat.regression;

import java.io.Serializable;

import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.exception.MathIllegalArgumentException;
import org.apache.commons.math4.exception.NoDataException;
import org.apache.commons.math4.exception.OutOfRangeException;
import org.apache.commons.math4.exception.util.LocalizedFormats;
import org.apache.commons.math4.util.FastMath;
import org.apache.commons.numbers.core.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x  </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e., all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
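 * <p>
 * For example, a minimal usage sketch (the data values below are
 * illustrative only):</p>
 * <pre>{@code
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 5d);
 *
 * double slope = regression.getSlope();
 * double intercept = regression.getIntercept();
 * double yAtTen = regression.predict(10d);   // intercept + slope * 10
 * }</pre>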
 *
 */
public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** sum of x values */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar) */
    private double sumXX;

    /** sum of y values */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar) */
    private double sumYY;

    /** sum of products */
    private double sumXY;

    /** number of observations */
    private long n;

    /** mean of accumulated x values, used in updating formulas */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas */
    private double ybar;

    /** include an intercept or not */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance
     */
    public SimpleRegression() {
        this(true);
    }

    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
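     *
     * <p>For example, a minimal sketch (the data values are illustrative only)
     * of a regression forced through the origin:</p>
     * <pre>{@code
     * SimpleRegression throughOrigin = new SimpleRegression(false);
     * throughOrigin.addData(1d, 2d);
     * throughOrigin.addData(2d, 4.1d);
     * throughOrigin.getIntercept(); // always 0
     * }</pre>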
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                // Chan, Golub and LeVeque updating formulas: accumulate centered
                // sums of squares and cross products incrementally.
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            // Without an intercept, accumulate raw (uncentered) sums.
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
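     *
     * <p>For example, a minimal sketch (the instances and data values are
     * illustrative only) of combining statistics accumulated over two
     * disjoint batches of observations:</p>
     * <pre>{@code
     * SimpleRegression first = new SimpleRegression();
     * first.addData(1d, 2d);
     * first.addData(2d, 3d);
     *
     * SimpleRegression second = new SimpleRegression();
     * second.addData(3d, 5d);
     * second.addData(4d, 6d);
     *
     * first.append(second); // first now reflects all four observations
     * }</pre>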
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller is
     * responsible for maintaining the set of observations in the window.</p>
     * <p>
     * This method has no effect if there are no data points in the model
     * (i.e. n = 0).</p>
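     * <p>
     * For example, a minimal sketch (the data values are illustrative only) of
     * a sliding window of three observations maintained by the caller:</p>
     * <pre>{@code
     * SimpleRegression regression = new SimpleRegression();
     * regression.addData(1d, 2d);
     * regression.addData(2d, 3d);
     * regression.addData(3d, 5d);
     *
     * regression.removeData(1d, 2d); // drop the oldest observation ...
     * regression.addData(4d, 6d);    // ... and slide the window forward
     * }</pre>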
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
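     * <p>
     * For example, a minimal sketch (the data values are illustrative only):</p>
     * <pre>{@code
     * double[][] data = { {1d, 2d}, {2d, 3d}, {3d, 5d} };
     * SimpleRegression regression = new SimpleRegression();
     * regression.addData(data);
     * }</pre>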
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                        data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x, final double y)
            throws ModelSpecificationException {
        if (x == null || x.length == 0) {
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable; the length
     * of {@code y} must match the number of rows of {@code x}
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if ((x == null) || (y == null) || (x.length != y.length)) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
                break;
            }
        }
        if (!obsOk) {
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return the number of observations that have been added
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative, i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of cross products of the x and y deviations from their
     * means (or of the raw x<sub>i</sub>*y<sub>i</sub> values if the model
     * has no intercept).
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return the mean square error
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = FastMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.  Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return FastMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return FastMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code>, as shown in the example below.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong> Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul>
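     * <p>
     * For example, a minimal sketch (the regression instance is assumed to
     * already contain data) of computing a 99% confidence interval for the
     * slope:</p>
     * <pre>{@code
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01);
     * double lowerBound = regression.getSlope() - halfWidth;
     * double upperBound = regression.getSlope() + halfWidth;
     * }</pre>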
     *
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the
     * slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
            throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted {@code Pr(>|t|)}.
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
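     * <p>
     * For example, a minimal sketch (the regression instance is assumed to
     * already contain data) of testing the slope at the 5% level:</p>
     * <pre>{@code
     * if (regression.getSignificance() < 0.05) {
     *     // the slope is significantly different from zero at the 95% level
     * }
     * }</pre>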
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math4.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return 2d * (1.0 - distribution.cumulativeProbability(
                    FastMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
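     *
     * <p>For example, a minimal sketch (assuming the regression instance already
     * contains enough data, and that {@link RegressionResults} exposes the
     * parameter and R-square accessors used below):</p>
     * <pre>{@code
     * RegressionResults results = regression.regress();
     * double intercept = results.getParameterEstimate(0);
     * double slope     = results.getParameterEstimate(1);
     * double rSquared  = results.getRSquared();
     * }</pre>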
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double _syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object.
     *
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new OutOfRangeException(variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                final double _mean = sumY * sumY / n;
                final double _syy = sumYY + _mean;
                if (variablesToInclude[0] == 0) {
                    // intercept only: just the mean of y
                    final double[] vcv = new double[] { sumYY / ((n - 1) * n) };
                    final double[] params = new double[] { ybar };
                    return new RegressionResults(
                            params, new double[][] { vcv }, true, n, 1,
                            sumY, _syy + _mean, sumYY, true, false);
                } else if (variablesToInclude[0] == 1) {
                    // slope only: regression through the origin on raw (uncentered) sums
                    final double _sxx = sumXX + sumX * sumX / n;
                    final double _sxy = sumXY + sumX * sumY / n;
                    final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
                    final double _mse = _sse / (n - 1);
                    if (!Double.isNaN(_sxx)) {
                        final double[] vcv = new double[] { _mse / _sxx };
                        final double[] params = new double[] { _sxy / _sxx };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                sumY, _syy, _sse, false, false);
                    } else {
                        final double[] vcv = new double[] { Double.NaN };
                        final double[] params = new double[] { Double.NaN };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}