/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.stat.regression;

import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
import org.apache.commons.math4.legacy.exception.NoDataException;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.core.jdkmath.JdkMath;
import org.apache.commons.numbers.core.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array. The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance. There is no
 * "compute" method that updates all statistics. Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor. When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
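 *
 * <p>A minimal usage sketch (the observation values below are illustrative
 * only):</p>
 * <pre>{@code
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 6d);
 *
 * double slope = regression.getSlope();          // estimated b1
 * double intercept = regression.getIntercept();  // estimated b0
 * double yHat = regression.predict(2.5d);        // intercept + slope * 2.5
 * }</pre>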
 *
 */
public class SimpleRegression implements UpdatingMultipleLinearRegression {
    /** sum of x values. */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar). */
    private double sumXX;

    /** sum of y values. */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar). */
    private double sumYY;

    /** sum of products. */
    private double sumXY;

    /** number of observations. */
    private long n;

    /** mean of accumulated x values, used in updating formulas. */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas. */
    private double ybar;

    /** include an intercept or not. */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept. When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
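     * <p>
     * In sketch form, when the model includes an intercept and writing
     * {@code dx = x - xbar} and {@code dy = y - ybar} with {@code n} the
     * number of observations already in the model, each call applies the
     * updates below (this mirrors the implementation and is shown only as an
     * illustration of the one-pass algorithm):</p>
     * <pre>
     * sumXX += dx * dx * n / (n + 1)
     * sumYY += dy * dy * n / (n + 1)
     * sumXY += dx * dy * n / (n + 1)
     * xbar  += dx / (n + 1)
     * ybar  += dy / (n + 1)
     * </pre>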
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x,final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if( hasIntercept ){
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if( !hasIntercept ){
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * Pébay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
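     * <p>
     * For example, two partial regressions built on disjoint halves of a data
     * set can be merged into a single model (a sketch; the observation values
     * are illustrative only):</p>
     * <pre>{@code
     * SimpleRegression first = new SimpleRegression();
     * first.addData(new double[][] {{1, 2}, {2, 3}});
     *
     * SimpleRegression second = new SimpleRegression();
     * second.addData(new double[][] {{3, 5}, {4, 6}});
     *
     * first.append(second); // "first" now reflects all four observations
     * }</pre>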
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method. This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no points of data (i.e. n=0).
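     * <p>
     * Sliding-window sketch, assuming {@code regression} is an instance of
     * this class and the window holds three observations (all values are
     * illustrative only):</p>
     * <pre>{@code
     * regression.addData(1d, 2d);
     * regression.addData(2d, 3d);
     * regression.addData(3d, 5d);
     * regression.addData(4d, 6d);    // newest observation enters the window
     * regression.removeData(1d, 2d); // oldest observation leaves the window
     * }</pre>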
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x,final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added. The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
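     * <p>
     * A usage sketch, assuming {@code regression} is an instance of this
     * class (values are illustrative only):</p>
     * <pre>{@code
     * double[][] observations = {{1, 2}, {2, 3}, {3, 5}};
     * regression.addData(observations);
     * }</pre>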
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is
     * less than 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if( data[i].length < 2 ){
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                     data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x,final double y)
            throws ModelSpecificationException {
        if( x == null || x.length == 0 ){
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
        }
        addData( x[0], y );
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable; the length
     * of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException {
        if (x == null || y == null || x.length != y.length) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for( int i = 0 ; i < x.length; i++){
            if( x[i] == null || x[i].length == 0 ){
                obsOk = false;
                break;
            }
        }
        if( !obsOk ){
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for( int i = 0 ; i < x.length ; i++){
            addData( x[i][0], y[i] );
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed. This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (JdkMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return JdkMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * If {@code n < 2}, this returns <code>Double.NaN</code>.
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of cross products, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM. It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return the mean square error
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = JdkMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * Additionally, <code>Double.NaN</code> is returned when the intercept is
     * constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if( !hasIntercept ){
            return Double.NaN;
        }
        return JdkMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return JdkMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong>Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul>
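     * <p>
     * For example, a 99% confidence interval for the slope can be obtained as
     * follows (a sketch; {@code regression} is assumed to hold a fitted
     * model):</p>
     * <pre>{@code
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01);
     * double lowerBound = regression.getSlope() - halfWidth;
     * double upperBound = regression.getSlope() + halfWidth;
     * }</pre>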
     *
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the
     * slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
        throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return getSlopeStdErr() *
            distribution.inverseSurvivalProbability(alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted {@code Prob(|t| > 0)}
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
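     * <p>
     * A common usage sketch, assuming {@code regression} holds a fitted model
     * (the 1% threshold is illustrative only):</p>
     * <pre>{@code
     * if (regression.getSignificance() < 0.01) {
     *     // reject the hypothesis of zero slope at the 1% significance level
     * }
     * }</pre>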
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return 2d * (distribution.survivalProbability(
                    JdkMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if( hasIntercept){
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown. If there is no intercept term, the model must
     * contain at least 2 observations.</p>
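     * <p>
     * A usage sketch (this assumes {@link RegressionResults#getParameterEstimates()},
     * which returns the estimates in the order intercept, slope when the model
     * includes an intercept):</p>
     * <pre>{@code
     * RegressionResults results = regression.regress();
     * double[] estimates = results.getParameterEstimates();
     * double intercept = estimates[0];
     * double slope = estimates[1];
     * }</pre>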
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (JdkMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object.
     *
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{
        if( variablesToInclude == null || variablesToInclude.length == 0){
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if( hasIntercept ){
            if( variablesToInclude.length == 2 ){
                if( variablesToInclude[0] == 1 ){
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                }else if( variablesToInclude[0] != 0 ){
                    throw new OutOfRangeException( variablesToInclude[0], 0,1 );
                }
                if( variablesToInclude[1] != 1){
                    throw new OutOfRangeException( variablesToInclude[1], 0,1 );
                }
                return regress();
            }else{
                if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
                    throw new OutOfRangeException( variablesToInclude[0],0,1 );
                }
                final double mean = sumY * sumY / n;
                final double syy = sumYY + mean;
                if( variablesToInclude[0] == 0 ){
                    //just the mean
                    final double[] vcv = new double[]{ sumYY/(((n-1)*n)) };
                    final double[] params = new double[]{ ybar };
                    return new RegressionResults(
                            params, new double[][]{vcv}, true, n, 1,
                            sumY, syy+mean, sumYY,true,false);
                }else if( variablesToInclude[0] == 1){
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double sxx = sumXX + sumX * sumX / n;
                    final double sxy = sumXY + sumX * sumY / n;
                    final double sse = JdkMath.max(0d, syy - sxy * sxy / sxx);
                    final double mse = sse/((n-1));
                    if( !Double.isNaN(sxx) ){
                        final double[] vcv = new double[]{ mse / sxx };
                        final double[] params = new double[]{ sxy/sxx };
                        return new RegressionResults(
                                params, new double[][]{vcv}, true, n, 1,
                                sumY, syy, sse,false,false);
                    }else{
                        final double[] vcv = new double[]{ Double.NaN };
                        final double[] params = new double[]{ Double.NaN };
                        return new RegressionResults(
                                params, new double[][]{vcv}, true, n, 1,
                                Double.NaN, Double.NaN, Double.NaN,false,false);
                    }
                }
            }
        }else{
            if( variablesToInclude[0] != 0 ){
                throw new OutOfRangeException(variablesToInclude[0],0,0);
            }
            return regress();
        }

        return null;
    }
}