View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math4.legacy.stat.regression;
19  
20  import org.apache.commons.statistics.distribution.TDistribution;
21  import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
22  import org.apache.commons.math4.legacy.exception.NoDataException;
23  import org.apache.commons.math4.legacy.exception.OutOfRangeException;
24  import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
25  import org.apache.commons.math4.core.jdkmath.JdkMath;
26  import org.apache.commons.numbers.core.Precision;
27  
28  /**
29   * Estimates an ordinary least squares regression model
30   * with one independent variable.
31   * <p>
32   * <code> y = intercept + slope * x  </code></p>
33   * <p>
34   * Standard errors for <code>intercept</code> and <code>slope</code> are
35   * available as well as ANOVA, r-square and Pearson's r statistics.</p>
36   * <p>
37   * Observations (x,y pairs) can be added to the model one at a time or they
38   * can be provided in a 2-dimensional array.  The observations are not stored
39   * in memory, so there is no limit to the number of observations that can be
40   * added to the model.</p>
41   * <p>
42   * <strong>Usage Notes</strong>: <ul>
43   * <li> When there are fewer than two observations in the model, or when
44   * there is no variation in the x values (i.e. all x values are the same)
45   * all statistics return <code>NaN</code>. At least two observations with
46   * different x coordinates are required to estimate a bivariate regression
47   * model.
48   * </li>
49   * <li> Getters for the statistics always compute values based on the current
50   * set of observations -- i.e., you can get statistics, then add more data
51   * and get updated statistics without using a new instance.  There is no
52   * "compute" method that updates all statistics.  Each of the getters performs
53   * the necessary computations to return the requested statistic.
54   * </li>
55   * <li> The intercept term may be suppressed by passing {@code false} to
56   * the {@link #SimpleRegression(boolean)} constructor.  When the
57   * {@code hasIntercept} property is false, the model is estimated without a
58   * constant term and {@link #getIntercept()} returns {@code 0}.</li>
59   * </ul>
60   *
61   */
public class SimpleRegression implements UpdatingMultipleLinearRegression {
    /** Sum of x values. */
    private double sumX;

    /** Second moment of x: sum of squared deviations from xbar when the model
     * includes an intercept; otherwise the raw sum of x * x (see addData). */
    private double sumXX;

    /** Sum of y values. */
    private double sumY;

    /** Second moment of y: sum of squared deviations from ybar when the model
     * includes an intercept; otherwise the raw sum of y * y. */
    private double sumYY;

    /** Co-moment: sum of products of x and y deviations from their means when
     * the model includes an intercept; otherwise the raw sum of x * y. */
    private double sumXY;

    /** Number of observations. */
    private long n;

    /** Mean of accumulated x values, used in the updating formulas
     * (seeded by the first observation in both modes). */
    private double xbar;

    /** Mean of accumulated y values, used in the updating formulas. */
    private double ybar;

    /** Whether the model includes an intercept term; fixed at construction. */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------
90  
    /**
     * Create an empty SimpleRegression instance.
     * Equivalent to {@code SimpleRegression(true)}: an intercept is estimated.
     */
    public SimpleRegression() {
        this(true);
    }
97      /**
98      * Create a SimpleRegression instance, specifying whether or not to estimate
99      * an intercept.
100     *
101     * <p>Use {@code false} to estimate a model with no intercept.  When the
102     * {@code hasIntercept} property is false, the model is estimated without a
103     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
104     *
105     * @param includeIntercept whether or not to include an intercept term in
106     * the regression model
107     */
108     public SimpleRegression(boolean includeIntercept) {
109         super();
110         hasIntercept = includeIntercept;
111     }
112 
113     /**
114      * Adds the observation (x,y) to the regression data set.
115      * <p>
116      * Uses updating formulas for means and sums of squares defined in
117      * "Algorithms for Computing the Sample Variance: Analysis and
118      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
119      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
120      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
121      *
122      *
123      * @param x independent variable value
124      * @param y dependent variable value
125      */
126     public void addData(final double x,final double y) {
127         if (n == 0) {
128             xbar = x;
129             ybar = y;
130         } else {
131             if( hasIntercept ){
132                 final double fact1 = 1.0 + n;
133                 final double fact2 = n / (1.0 + n);
134                 final double dx = x - xbar;
135                 final double dy = y - ybar;
136                 sumXX += dx * dx * fact2;
137                 sumYY += dy * dy * fact2;
138                 sumXY += dx * dy * fact2;
139                 xbar += dx / fact1;
140                 ybar += dy / fact1;
141             }
142          }
143         if( !hasIntercept ){
144             sumXX += x * x ;
145             sumYY += y * y ;
146             sumXY += x * y ;
147         }
148         sumX += x;
149         sumY += y;
150         n++;
151     }
152 
    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            // This model is empty: adopt the other model's moments wholesale.
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                // Pairwise merge of centered moments (Pebay, 2008): combine
                // the two partitions' means and co-moments in one pass.
                // NOTE(review): n * reg.n is evaluated in long arithmetic and
                // could overflow for extremely large observation counts.
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            }else{
                // Without an intercept the sums are uncentered and add directly.
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }
195 
    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations, however the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no points of data (i.e. n=0)
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x,final double y) {
        if (n > 0) {
            if (hasIntercept) {
                // Exact inverse of the addData updating formulas.
                // NOTE(review): when n == 1, fact1 == 0; removing the single
                // stored point yields 0/0 == NaN in xbar/ybar.  This is benign
                // only because the next addData re-seeds the means at n == 0.
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                // NOTE(review): this branch maintains xbar/ybar on removal,
                // whereas addData's no-intercept path only seeds them at n == 0
                // -- asymmetric, but xbar/ybar are unused in this mode; confirm.
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
             sumX -= x;
             sumY -= y;
             n--;
        }
    }
234 
235     /**
236      * Adds the observations represented by the elements in
237      * <code>data</code>.
238      * <p>
239      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
240      * <code>(data[1][0],data[1][1])</code>, etc.</p>
241      * <p>
242      * This method does not replace data that has already been added.  The
243      * observations represented by <code>data</code> are added to the existing
244      * dataset.</p>
245      * <p>
246      * To replace all data, use <code>clear()</code> before adding the new
247      * data.</p>
248      *
249      * @param data array of observations to be added
250      * @throws ModelSpecificationException if the length of {@code data[i]} is not
251      * greater than or equal to 2
252      */
253     public void addData(final double[][] data) throws ModelSpecificationException {
254         for (int i = 0; i < data.length; i++) {
255             if( data[i].length < 2 ){
256                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
257                     data[i].length, 2);
258             }
259             addData(data[i][0], data[i][1]);
260         }
261     }
262 
263     /**
264      * Adds one observation to the regression model.
265      *
266      * @param x the independent variables which form the design matrix
267      * @param y the dependent or response variable
268      * @throws ModelSpecificationException if the length of {@code x} does not equal
269      * the number of independent variables in the model
270      */
271     @Override
272     public void addObservation(final double[] x,final double y)
273             throws ModelSpecificationException {
274         if( x == null || x.length == 0 ){
275             throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
276         }
277         addData( x[0], y );
278     }
279 
280     /**
281      * Adds a series of observations to the regression model. The lengths of
282      * x and y must be the same and x must be rectangular.
283      *
284      * @param x a series of observations on the independent variables
285      * @param y a series of observations on the dependent variable
286      * The length of x and y must be the same
287      * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
288      * the length of {@code y} or does not contain sufficient data to estimate the model
289      */
290     @Override
291     public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException {
292         if (x == null || y == null || x.length != y.length) {
293             throw new ModelSpecificationException(
294                   LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
295                   (x == null) ? 0 : x.length,
296                   (y == null) ? 0 : y.length);
297         }
298         boolean obsOk = true;
299         for( int i = 0 ; i < x.length; i++){
300             if( x[i] == null || x[i].length == 0 ){
301                 obsOk = false;
302                 break;
303             }
304         }
305         if( !obsOk ){
306             throw new ModelSpecificationException(
307                   LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
308                   0, 1);
309         }
310         for( int i = 0 ; i < x.length ; i++){
311             addData( x[i][0], y[i] );
312         }
313     }
314 
315     /**
316      * Removes observations represented by the elements in <code>data</code>.
317       * <p>
318      * If the array is larger than the current n, only the first n elements are
319      * processed.  This method permits the use of SimpleRegression instances in
320      * streaming mode where the regression is applied to a sliding "window" of
321      * observations, however the caller is responsible for maintaining the set
322      * of observations in the window.</p>
323      * <p>
324      * To remove all data, use <code>clear()</code>.</p>
325      *
326      * @param data array of observations to be removed
327      */
328     public void removeData(double[][] data) {
329         for (int i = 0; i < data.length && n > 0; i++) {
330             removeData(data[i][0], data[i][1]);
331         }
332     }
333 
334     /**
335      * Clears all data from the model.
336      */
337     @Override
338     public void clear() {
339         sumX = 0d;
340         sumXX = 0d;
341         sumY = 0d;
342         sumYY = 0d;
343         sumXY = 0d;
344         n = 0;
345     }
346 
    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return the number of observations that have been added
     */
    @Override
    public long getN() {
        return n;
    }
356 
357     /**
358      * Returns the "predicted" <code>y</code> value associated with the
359      * supplied <code>x</code> value,  based on the data that has been
360      * added to the model when this method is activated.
361      * <p>
362      * <code> predict(x) = intercept + slope * x </code></p>
363      * <p>
364      * <strong>Preconditions</strong>: <ul>
365      * <li>At least two observations (with at least two different x values)
366      * must have been added before invoking this method. If this method is
367      * invoked before a model can be estimated, <code>Double,NaN</code> is
368      * returned.
369      * </li></ul>
370      *
371      * @param x input <code>x</code> value
372      * @return predicted <code>y</code> value
373      */
374     public double predict(final double x) {
375         final double b1 = getSlope();
376         if (hasIntercept) {
377             return getIntercept(b1) + b1 * x;
378         }
379         return b1 * x;
380     }
381 
382     /**
383      * Returns the intercept of the estimated regression line, if
384      * {@link #hasIntercept()} is true; otherwise 0.
385      * <p>
386      * The least squares estimate of the intercept is computed using the
387      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
388      * The intercept is sometimes denoted b0.</p>
389      * <p>
390      * <strong>Preconditions</strong>: <ul>
391      * <li>At least two observations (with at least two different x values)
392      * must have been added before invoking this method. If this method is
393      * invoked before a model can be estimated, <code>Double,NaN</code> is
394      * returned.
395      * </li></ul>
396      *
397      * @return the intercept of the regression line if the model includes an
398      * intercept; 0 otherwise
399      * @see #SimpleRegression(boolean)
400      */
401     public double getIntercept() {
402         return hasIntercept ? getIntercept(getSlope()) : 0.0;
403     }
404 
    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }
415 
416     /**
417     * Returns the slope of the estimated regression line.
418     * <p>
419     * The least squares estimate of the slope is computed using the
420     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
421     * The slope is sometimes denoted b1.</p>
422     * <p>
423     * <strong>Preconditions</strong>: <ul>
424     * <li>At least two observations (with at least two different x values)
425     * must have been added before invoking this method. If this method is
426     * invoked before a model can be estimated, <code>Double.NaN</code> is
427     * returned.
428     * </li></ul>
429     *
430     * @return the slope of the regression line
431     */
432     public double getSlope() {
433         if (n < 2) {
434             return Double.NaN; //not enough data
435         }
436         if (JdkMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
437             return Double.NaN; //not enough variation in x
438         }
439         return sumXY / sumXX;
440     }
441 
    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        // max(0, .) clamps rounding-induced negatives; a degenerate model
        // (sumXX == 0) produces NaN, which presumably propagates through
        // JdkMath.max as it does with Math.max -- confirm.
        return JdkMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }
474 
475     /**
476      * Returns the sum of squared deviations of the y values about their mean.
477      * <p>
478      * This is defined as SSTO
479      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
480      * <p>
481      * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
482      *
483      * @return sum of squared deviations of y values
484      */
485     public double getTotalSumSquares() {
486         if (n < 2) {
487             return Double.NaN;
488         }
489         return sumYY;
490     }
491 
492     /**
493      * Returns the sum of squared deviations of the x values about their mean.
494      *
495      * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.
496      *
497      * @return sum of squared deviations of x values
498      */
499     public double getXSumSquares() {
500         if (n < 2) {
501             return Double.NaN;
502         }
503         return sumXX;
504     }
505 
    /**
     * Returns the accumulated cross-product sum.
     * <p>
     * When the model includes an intercept this is the sum of products of
     * deviations from the means, sum((x<sub>i</sub> - xbar) * (y<sub>i</sub> - ybar));
     * without an intercept it is the raw sum x<sub>i</sub>*y<sub>i</sub>
     * (see {@link #addData(double, double)}).</p>
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }
514 
    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        // Delegates to the private overload using the current slope estimate.
        return getRegressionSumSquares(getSlope());
    }
534 
535     /**
536      * Returns the sum of squared errors divided by the degrees of freedom,
537      * usually abbreviated MSE.
538      * <p>
539      * If there are fewer than <strong>three</strong> data pairs in the model,
540      * or if there is no variation in <code>x</code>, this returns
541      * <code>Double.NaN</code>.</p>
542      *
543      * @return sum of squared deviations of y values
544      */
545     public double getMeanSquareError() {
546         if (n < 3) {
547             return Double.NaN;
548         }
549         return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
550     }
551 
552     /**
553      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
554      * Pearson's product moment correlation coefficient</a>,
555      * usually denoted r.
556      * <p>
557      * <strong>Preconditions</strong>: <ul>
558      * <li>At least two observations (with at least two different x values)
559      * must have been added before invoking this method. If this method is
560      * invoked before a model can be estimated, <code>Double,NaN</code> is
561      * returned.
562      * </li></ul>
563      *
564      * @return Pearson's r
565      */
566     public double getR() {
567         double b1 = getSlope();
568         double result = JdkMath.sqrt(getRSquare());
569         if (b1 < 0) {
570             result = -result;
571         }
572         return result;
573     }
574 
575     /**
576      * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
577      * coefficient of determination</a>,
578      * usually denoted r-square.
579      * <p>
580      * <strong>Preconditions</strong>: <ul>
581      * <li>At least two observations (with at least two different x values)
582      * must have been added before invoking this method. If this method is
583      * invoked before a model can be estimated, <code>Double,NaN</code> is
584      * returned.
585      * </li></ul>
586      *
587      * @return r-square
588      */
589     public double getRSquare() {
590         double ssto = getTotalSumSquares();
591         return (ssto - getSumSquaredErrors()) / ssto;
592     }
593 
594     /**
595      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
596      * standard error of the intercept estimate</a>,
597      * usually denoted s(b0).
598      * <p>
599      * If there are fewer that <strong>three</strong> observations in the
600      * model, or if there is no variation in x, this returns
601      * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
602      * returned when the intercept is constrained to be zero
603      *
604      * @return standard error associated with intercept estimate
605      */
606     public double getInterceptStdErr() {
607         if( !hasIntercept ){
608             return Double.NaN;
609         }
610         return JdkMath.sqrt(
611             getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
612     }
613 
    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        // sqrt(MSE / SXX); NaN propagates from getMeanSquareError when n < 3.
        return JdkMath.sqrt(getMeanSquareError() / sumXX);
    }
628 
    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        // Delegates with alpha = 0.05 (i.e. a 95% confidence level).
        return getSlopeConfidenceInterval(0.05d);
    }
654 
655     /**
656      * Returns the half-width of a (100-100*alpha)% confidence interval for
657      * the slope estimate.
658      * <p>
659      * The (100-100*alpha)% confidence interval is </p>
660      * <p>
661      * <code>(getSlope() - getSlopeConfidenceInterval(),
662      * getSlope() + getSlopeConfidenceInterval())</code></p>
663      * <p>
664      * To request, for example, a 99% confidence interval, use
665      * <code>alpha = .01</code></p>
666      * <p>
667      * <strong>Usage Note</strong>:<br>
668      * The validity of this statistic depends on the assumption that the
669      * observations included in the model are drawn from a
670      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
671      * Bivariate Normal Distribution</a>.</p>
672      * <p>
673      * <strong> Preconditions:</strong><ul>
674      * <li>If there are fewer that <strong>three</strong> observations in the
675      * model, or if there is no variation in x, this returns
676      * <code>Double.NaN</code>.
677      * </li>
678      * <li>{@code (0 < alpha < 1)}; otherwise an
679      * <code>OutOfRangeException</code> is thrown.
680      * </li></ul>
681      *
682      * @param alpha the desired significance level
683      * @return half-width of 95% confidence interval for the slope estimate
684      * @throws OutOfRangeException if the confidence interval can not be computed.
685      */
686     public double getSlopeConfidenceInterval(final double alpha)
687     throws OutOfRangeException {
688         if (n < 3) {
689             return Double.NaN;
690         }
691         if (alpha >= 1 || alpha <= 0) {
692             throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
693                                           alpha, 0, 1);
694         }
695         // No advertised NotStrictlyPositiveException here - will return NaN above
696         TDistribution distribution = TDistribution.of(n - 2d);
697         return getSlopeStdErr() *
698             distribution.inverseSurvivalProbability(alpha / 2d);
699     }
700 
701     /**
702      * Returns the significance level of the slope (equiv) correlation.
703      * <p>
704      * Specifically, the returned value is the smallest <code>alpha</code>
705      * such that the slope confidence interval with significance level
706      * equal to <code>alpha</code> does not include <code>0</code>.
707      * On regression output, this is often denoted {@code Prob(|t| > 0)}
708      * </p><p>
709      * <strong>Usage Note</strong>:<br>
710      * The validity of this statistic depends on the assumption that the
711      * observations included in the model are drawn from a
712      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
713      * Bivariate Normal Distribution</a>.</p>
714      * <p>
715      * If there are fewer that <strong>three</strong> observations in the
716      * model, or if there is no variation in x, this returns
717      * <code>Double.NaN</code>.</p>
718      *
719      * @return significance level for slope/correlation
720      * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
721      * if the significance level can not be computed.
722      */
723     public double getSignificance() {
724         if (n < 3) {
725             return Double.NaN;
726         }
727         // No advertised NotStrictlyPositiveException here - will return NaN above
728         TDistribution distribution = TDistribution.of(n - 2d);
729         return 2d * (distribution.survivalProbability(
730                     JdkMath.abs(getSlope()) / getSlopeStdErr()));
731     }
732 
733     // ---------------------Private methods-----------------------------------
734 
735     /**
736     * Returns the intercept of the estimated regression line, given the slope.
737     * <p>
738     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
739     *
740     * @param slope current slope
741     * @return the intercept of the regression line
742     */
743     private double getIntercept(final double slope) {
744       if( hasIntercept){
745         return (sumY - slope * sumX) / n;
746       }
747       return 0.0;
748     }
749 
    /**
     * Computes SSR from b1.
     * <p>
     * SSR = b1^2 * SXX, the sum of squared deviations of the predicted
     * y values about their mean.</p>
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }
759 
    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (JdkMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                // Uncentered sum of squared y, reconstructed from the centered
                // sumYY: sum(y^2) = sumYY + (sum y)^2 / n.
                final double syy = sumYY + sumY * sumY / n;
                // Lower triangle of the 2x2 variance-covariance matrix:
                // var(b0), cov(b0,b1), var(b1).
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, syy, getSumSquaredErrors(), true,
                        false);
            } else {
                // Degenerate x (no variation): fall back to the mean-only model
                // y = ybar with an undefined slope.
                final double[] params = new double[] { sumY / n, Double.NaN };
                // NOTE(review): the variance term ybar / (n - 1.0) looks
                // suspicious -- a sample variance of y would involve sumYY,
                // not the mean; verify against the intended estimator.
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                // Single-parameter model through the origin: b1 = SXY / SXX.
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                // sumXX is NaN: no estimable parameters.
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }
809 
810     /**
811      * Performs a regression on data present in buffers including only regressors.
812      * indexed in variablesToInclude and outputs a RegressionResults object
813      * @param variablesToInclude an array of indices of regressors to include
814      * @return RegressionResults acts as a container of regression output
815      * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
816      * @throws OutOfRangeException if a requested variable is not present in model
817      */
818     @Override
819     public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{
820         if( variablesToInclude == null || variablesToInclude.length == 0){
821           throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
822         }
823         if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
824             throw new ModelSpecificationException(
825                     LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
826                     (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
827         }
828 
829         if( hasIntercept ){
830             if( variablesToInclude.length == 2 ){
831                 if( variablesToInclude[0] == 1 ){
832                     throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
833                 }else if( variablesToInclude[0] != 0 ){
834                     throw new OutOfRangeException( variablesToInclude[0], 0,1 );
835                 }
836                 if( variablesToInclude[1] != 1){
837                      throw new OutOfRangeException( variablesToInclude[0], 0,1 );
838                 }
839                 return regress();
840             }else{
841                 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
842                      throw new OutOfRangeException( variablesToInclude[0],0,1 );
843                 }
844                 final double mean = sumY * sumY / n;
845                 final double syy = sumYY + mean;
846                 if( variablesToInclude[0] == 0 ){
847                     //just the mean
848                     final double[] vcv = new double[]{ sumYY/(((n-1)*n)) };
849                     final double[] params = new double[]{ ybar };
850                     return new RegressionResults(
851                       params, new double[][]{vcv}, true, n, 1,
852                       sumY, syy+mean, sumYY,true,false);
853                 }else if( variablesToInclude[0] == 1){
854                     //final double _syy = sumYY + sumY * sumY / ((double) n);
855                     final double sxx = sumXX + sumX * sumX / n;
856                     final double sxy = sumXY + sumX * sumY / n;
857                     final double sse = JdkMath.max(0d, syy - sxy * sxy / sxx);
858                     final double mse = sse/((n-1));
859                     if( !Double.isNaN(sxx) ){
860                         final double[] vcv = new double[]{ mse / sxx };
861                         final double[] params = new double[]{ sxy/sxx };
862                         return new RegressionResults(
863                                     params, new double[][]{vcv}, true, n, 1,
864                                     sumY, syy, sse,false,false);
865                     }else{
866                         final double[] vcv = new double[]{Double.NaN };
867                         final double[] params = new double[]{ Double.NaN };
868                         return new RegressionResults(
869                                     params, new double[][]{vcv}, true, n, 1,
870                                     Double.NaN, Double.NaN, Double.NaN,false,false);
871                     }
872                 }
873             }
874         }else{
875             if( variablesToInclude[0] != 0 ){
876                 throw new OutOfRangeException(variablesToInclude[0],0,0);
877             }
878             return regress();
879         }
880 
881         return null;
882     }
883 }