/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.stat.regression;

import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
import org.apache.commons.math4.legacy.exception.NoDataException;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.core.jdkmath.JdkMath;
import org.apache.commons.numbers.core.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x  </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
 *
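 * <p>
 * A minimal usage sketch (the data values are arbitrary and purely
 * illustrative):</p>
 * <pre>
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 5d);
 *
 * double slope = regression.getSlope();          // estimated slope
 * double intercept = regression.getIntercept();  // estimated intercept
 * double prediction = regression.predict(1.5d);  // intercept + slope * 1.5
 * </pre>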
 */
public class SimpleRegression implements UpdatingMultipleLinearRegression {
    /** sum of x values. */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar). */
    private double sumXX;

    /** sum of y values. */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar). */
    private double sumYY;

    /** sum of products. */
    private double sumXY;

    /** number of observations. */
    private long n;

    /** mean of accumulated x values, used in updating formulas. */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas. */
    private double ybar;

    /** include an intercept or not. */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
    * Create a SimpleRegression instance, specifying whether or not to estimate
    * an intercept.
    *
    * <p>Use {@code false} to estimate a model with no intercept.  When the
    * {@code hasIntercept} property is false, the model is estimated without a
    * constant term and {@link #getIntercept()} returns {@code 0}.</p>
    *
    * @param includeIntercept whether or not to include an intercept term in
    * the regression model
    */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
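     * <p>
     * Illustrative sketch of the update performed for each additional
     * observation when the model has an intercept (this mirrors the
     * implementation below; the first observation simply initializes the
     * means):</p>
     * <pre>
     * dx = x - xbar;
     * dy = y - ybar;
     * sumXX += dx * dx * n / (n + 1);
     * sumYY += dy * dy * n / (n + 1);
     * sumXY += dx * dy * n / (n + 1);
     * xbar  += dx / (n + 1);
     * ybar  += dy / (n + 1);
     * </pre>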
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x,final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if( hasIntercept ){
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if( !hasIntercept ){
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
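     * <p>
     * Illustrative sketch (with arbitrary data) of merging two partial fits,
     * for example computed over separate chunks of a larger data set:</p>
     * <pre>
     * SimpleRegression first = new SimpleRegression();
     * first.addData(new double[][] {{1, 2}, {2, 3}});
     *
     * SimpleRegression second = new SimpleRegression();
     * second.addData(new double[][] {{3, 5}, {4, 6}});
     *
     * first.append(second); // "first" now reflects all four observations
     * </pre>
     *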
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * <p>The method has no effect if there are no points of data (i.e. n=0).</p>
     *
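     * <p>
     * Sliding-window sketch for each incoming pair; {@code window} is a
     * hypothetical {@code Deque<double[]>} maintained by the caller and
     * {@code maxWindowSize} is the desired window length (neither is part of
     * this class):</p>
     * <pre>
     * window.addLast(new double[] {x, y});
     * regression.addData(x, y);
     * if (window.size() > maxWindowSize) {
     *     double[] oldest = window.removeFirst();
     *     regression.removeData(oldest[0], oldest[1]);
     * }
     * </pre>
     *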
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x,final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
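     * <p>
     * For example (with arbitrary values), the following call adds three
     * observations in one step:</p>
     * <pre>
     * regression.addData(new double[][] {{1, 2}, {2, 3}, {3, 5}});
     * </pre>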
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is
     * less than 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if( data[i].length < 2 ){
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x,final double y)
            throws ModelSpecificationException {
        if( x == null || x.length == 0 ){
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                                                  x != null ? x.length : 0, 1);
        }
        addData( x[0], y );
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable; the
     * length of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException {
        if (x == null || y == null || x.length != y.length) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for( int i = 0 ; i < x.length; i++){
            if( x[i] == null || x[i].length == 0 ){
                obsOk = false;
                break;
            }
        }
        if( !obsOk ){
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for( int i = 0 ; i < x.length ; i++){
            addData( x[i][0], y[i] );
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
    * Returns the slope of the estimated regression line.
    * <p>
    * The least squares estimate of the slope is computed using the
    * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
    * The slope is sometimes denoted b1.</p>
    * <p>
    * <strong>Preconditions</strong>: <ul>
    * <li>At least two observations (with at least two different x values)
    * must have been added before invoking this method. If this method is
    * invoked before a model can be estimated, <code>Double.NaN</code> is
    * returned.
    * </li></ul>
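    * <p>
    * As implemented, the returned value is the ratio of the accumulated sum
    * of cross products of the x and y deviations to the accumulated sum of
    * squared x deviations, i.e. <code>slope = sumXY / sumXX</code>.</p>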
    *
    * @return the slope of the regression line
    */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (JdkMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return JdkMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of cross products of the x and y deviations from their
     * means (when the model has no intercept, the raw products
     * x<sub>i</sub>*y<sub>i</sub> are accumulated instead).
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * The degrees of freedom are <code>n - 2</code> when the model includes an
     * intercept and <code>n - 1</code> otherwise.</p>
     *
     * @return the mean square error associated with the regression model
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = JdkMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * Additionally, <code>Double.NaN</code> is returned when the intercept is
     * constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if( !hasIntercept ){
            return Double.NaN;
        }
        return JdkMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return JdkMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval cannot be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong>Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul>
     *
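     * <p>
     * Sketch of building a 99% interval from the returned half-width, where
     * {@code regression} is an instance of this class (illustrative only):</p>
     * <pre>
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01);
     * double lower = regression.getSlope() - halfWidth;
     * double upper = regression.getSlope() + halfWidth;
     * </pre>
     *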
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the
     * slope estimate
     * @throws OutOfRangeException if the confidence interval cannot be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
    throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return getSlopeStdErr() *
            distribution.inverseSurvivalProbability(alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equivalently, of the
     * correlation).
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted {@code Pr(>|t|)}.
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
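     * <p>
     * As implemented, the returned value is the two-sided p-value
     * {@code 2 * P(T > |slope| / slopeStdErr)}, where {@code T} follows a
     * t distribution with {@code n - 2} degrees of freedom.</p>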
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
     * if the significance level cannot be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return 2d * (distribution.survivalProbability(
                    JdkMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
    * Returns the intercept of the estimated regression line, given the slope.
    * <p>
    * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
    *
    * @param slope current slope
    * @return the intercept of the regression line
    */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
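     * <p>
     * Sketch of reading estimates back from the returned container (assumes
     * an intercept model, where parameter 0 is the intercept and parameter 1
     * is the slope):</p>
     * <pre>
     * RegressionResults results = regression.regress();
     * double intercept = results.getParameterEstimate(0);
     * double slope = results.getParameterEstimate(1);
     * double rSquared = results.getRSquared();
     * </pre>
     *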
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (JdkMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers, including only the
     * regressors indexed in {@code variablesToInclude}, and outputs a
     * RegressionResults object.
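     * <p>
     * For an intercept model, {@code new int[] {0, 1}} is equivalent to
     * {@link #regress()}, {@code new int[] {0}} fits the mean-only model and
     * {@code new int[] {1}} fits a model through the origin (see the
     * implementation for details).</p>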
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{
        if( variablesToInclude == null || variablesToInclude.length == 0){
          throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if( hasIntercept ){
            if( variablesToInclude.length == 2 ){
                if( variablesToInclude[0] == 1 ){
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                }else if( variablesToInclude[0] != 0 ){
                    throw new OutOfRangeException( variablesToInclude[0], 0,1 );
                }
                if( variablesToInclude[1] != 1){
                     throw new OutOfRangeException( variablesToInclude[1], 0,1 );
                }
                return regress();
            }else{
                if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
                     throw new OutOfRangeException( variablesToInclude[0],0,1 );
                }
                final double mean = sumY * sumY / n;
                final double syy = sumYY + mean;
                if( variablesToInclude[0] == 0 ){
                    //just the mean
                    final double[] vcv = new double[]{ sumYY/(((n-1)*n)) };
                    final double[] params = new double[]{ ybar };
                    return new RegressionResults(
                      params, new double[][]{vcv}, true, n, 1,
                      sumY, syy+mean, sumYY,true,false);
                }else if( variablesToInclude[0] == 1){
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double sxx = sumXX + sumX * sumX / n;
                    final double sxy = sumXY + sumX * sumY / n;
                    final double sse = JdkMath.max(0d, syy - sxy * sxy / sxx);
                    final double mse = sse/((n-1));
                    if( !Double.isNaN(sxx) ){
                        final double[] vcv = new double[]{ mse / sxx };
                        final double[] params = new double[]{ sxy/sxx };
                        return new RegressionResults(
                                    params, new double[][]{vcv}, true, n, 1,
                                    sumY, syy, sse,false,false);
                    }else{
                        final double[] vcv = new double[]{Double.NaN };
                        final double[] params = new double[]{ Double.NaN };
                        return new RegressionResults(
                                    params, new double[][]{vcv}, true, n, 1,
                                    Double.NaN, Double.NaN, Double.NaN,false,false);
                    }
                }
            }
        }else{
            if( variablesToInclude[0] != 0 ){
                throw new OutOfRangeException(variablesToInclude[0],0,0);
            }
            return regress();
        }

        return null;
    }
}