001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math.stat.regression;
019 import java.io.Serializable;
020
021 import org.apache.commons.math.MathException;
022 import org.apache.commons.math.exception.OutOfRangeException;
023 import org.apache.commons.math.distribution.TDistribution;
024 import org.apache.commons.math.distribution.TDistributionImpl;
025 import org.apache.commons.math.exception.MathIllegalArgumentException;
026 import org.apache.commons.math.exception.NoDataException;
027 import org.apache.commons.math.exception.util.LocalizedFormats;
028 import org.apache.commons.math.util.FastMath;
029 import org.apache.commons.math.util.MathUtils;
030
031 /**
032 * Estimates an ordinary least squares regression model
033 * with one independent variable.
034 * <p>
035 * <code> y = intercept + slope * x </code></p>
036 * <p>
037 * Standard errors for <code>intercept</code> and <code>slope</code> are
038 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
039 * <p>
040 * Observations (x,y pairs) can be added to the model one at a time or they
041 * can be provided in a 2-dimensional array. The observations are not stored
042 * in memory, so there is no limit to the number of observations that can be
043 * added to the model.</p>
044 * <p>
045 * <strong>Usage Notes</strong>: <ul>
046 * <li> When there are fewer than two observations in the model, or when
047 * there is no variation in the x values (i.e. all x values are the same)
048 * all statistics return <code>NaN</code>. At least two observations with
049 * different x coordinates are required to estimate a bivariate regression
050 * model.
051 * </li>
052 * <li> getters for the statistics always compute values based on the current
053 * set of observations -- i.e., you can get statistics, then add more data
054 * and get updated statistics without using a new instance. There is no
055 * "compute" method that updates all statistics. Each of the getters performs
056 * the necessary computations to return the requested statistic.</li>
057 * </ul></p>
058 *
059 * @version $Id: SimpleRegression.java 1174509 2011-09-23 03:36:11Z gregs $
060 */
061 public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {
062
063 /** Serializable version identifier */
064 private static final long serialVersionUID = -3004689053607543335L;
065
066 /** sum of x values */
067 private double sumX = 0d;
068
069 /** total variation in x (sum of squared deviations from xbar) */
070 private double sumXX = 0d;
071
072 /** sum of y values */
073 private double sumY = 0d;
074
075 /** total variation in y (sum of squared deviations from ybar) */
076 private double sumYY = 0d;
077
078 /** sum of products */
079 private double sumXY = 0d;
080
081 /** number of observations */
082 private long n = 0;
083
084 /** mean of accumulated x values, used in updating formulas */
085 private double xbar = 0;
086
087 /** mean of accumulated y values, used in updating formulas */
088 private double ybar = 0;
089
090 /** include an intercept or not */
091 private final boolean hasIntercept;
092 // ---------------------Public methods--------------------------------------
093
094 /**
095 * Create an empty SimpleRegression instance
096 */
097 public SimpleRegression() {
098 this(true);
099 }
100 /**
101 * Secondary constructor which allows the user the ability to include/exclude const
102 * @param includeIntercept boolean flag, true includes an intercept
103 */
104 public SimpleRegression(boolean includeIntercept) {
105 super();
106 hasIntercept = includeIntercept;
107 }
108
109 /**
110 * Adds the observation (x,y) to the regression data set.
111 * <p>
112 * Uses updating formulas for means and sums of squares defined in
113 * "Algorithms for Computing the Sample Variance: Analysis and
114 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
115 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
116 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
117 *
118 *
119 * @param x independent variable value
120 * @param y dependent variable value
121 */
122 public void addData(final double x,final double y) {
123 if (n == 0) {
124 xbar = x;
125 ybar = y;
126 } else {
127 if( hasIntercept ){
128 final double fact1 = 1.0 + (double) n;
129 final double fact2 = ((double) n) / (1.0 + (double) n);
130 final double dx = x - xbar;
131 final double dy = y - ybar;
132 sumXX += dx * dx * fact2;
133 sumYY += dy * dy * fact2;
134 sumXY += dx * dy * fact2;
135 xbar += dx / fact1;
136 ybar += dy / fact1;
137 }
138 }
139 if( !hasIntercept ){
140 sumXX += x * x ;
141 sumYY += y * y ;
142 sumXY += x * y ;
143 }
144 sumX += x;
145 sumY += y;
146 n++;
147 return;
148 }
149
150
151 /**
152 * Removes the observation (x,y) from the regression data set.
153 * <p>
154 * Mirrors the addData method. This method permits the use of
155 * SimpleRegression instances in streaming mode where the regression
156 * is applied to a sliding "window" of observations, however the caller is
157 * responsible for maintaining the set of observations in the window.</p>
158 *
159 * The method has no effect if there are no points of data (i.e. n=0)
160 *
161 * @param x independent variable value
162 * @param y dependent variable value
163 */
164 public void removeData(final double x,final double y) {
165 if (n > 0) {
166 if (hasIntercept) {
167 final double fact1 = (double) n - 1.0;
168 final double fact2 = ((double) n) / ((double) n - 1.0);
169 final double dx = x - xbar;
170 final double dy = y - ybar;
171 sumXX -= dx * dx * fact2;
172 sumYY -= dy * dy * fact2;
173 sumXY -= dx * dy * fact2;
174 xbar -= dx / fact1;
175 ybar -= dy / fact1;
176 } else {
177 final double fact1 = (double) n - 1.0;
178 sumXX -= x * x;
179 sumYY -= y * y;
180 sumXY -= x * y;
181 xbar -= x / fact1;
182 ybar -= y / fact1;
183 }
184 sumX -= x;
185 sumY -= y;
186 n--;
187 }
188 return;
189 }
190
191 /**
192 * Adds the observations represented by the elements in
193 * <code>data</code>.
194 * <p>
195 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
196 * <code>(data[1][0],data[1][1])</code>, etc.</p>
197 * <p>
198 * This method does not replace data that has already been added. The
199 * observations represented by <code>data</code> are added to the existing
200 * dataset.</p>
201 * <p>
202 * To replace all data, use <code>clear()</code> before adding the new
203 * data.</p>
204 *
205 * @param data array of observations to be added
206 * @throws ModelSpecificationException if the length of {@code data[i]} is not
207 * greater than or equal to 2
208 */
209 public void addData(final double[][] data) {
210 for (int i = 0; i < data.length; i++) {
211 if( data[i].length < 2 ){
212 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
213 data[i].length, 2);
214 }
215 addData(data[i][0], data[i][1]);
216 }
217 return;
218 }
219
220 /**
221 * Adds one observation to the regression model.
222 *
223 * @param x the independent variables which form the design matrix
224 * @param y the dependent or response variable
225 * @throws ModelSpecificationException if the length of {@code x} does not equal
226 * the number of independent variables in the model
227 */
228 public void addObservation(final double[] x,final double y) throws ModelSpecificationException{
229 if( x == null || x.length == 0 ){
230 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
231 }
232 addData( x[0], y );
233 return;
234 }
235
236 /**
237 * Adds a series of observations to the regression model. The lengths of
238 * x and y must be the same and x must be rectangular.
239 *
240 * @param x a series of observations on the independent variables
241 * @param y a series of observations on the dependent variable
242 * The length of x and y must be the same
243 * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
244 * the length of {@code y} or does not contain sufficient data to estimate the model
245 */
246 public void addObservations(final double[][] x,final double[] y) {
247 if ((x == null) || (y == null) || (x.length != y.length)) {
248 throw new ModelSpecificationException(
249 LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
250 (x == null) ? 0 : x.length,
251 (y == null) ? 0 : y.length);
252 }
253 boolean obsOk=true;
254 for( int i = 0 ; i < x.length; i++){
255 if( x[i] == null || x[i].length == 0 ){
256 obsOk = false;
257 }
258 }
259 if( !obsOk ){
260 throw new ModelSpecificationException(
261 LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
262 0, 1);
263 }
264 for( int i = 0 ; i < x.length ; i++){
265 addData( x[i][0], y[i] );
266 }
267 return;
268 }
269
270 /**
271 * Removes observations represented by the elements in <code>data</code>.
272 * <p>
273 * If the array is larger than the current n, only the first n elements are
274 * processed. This method permits the use of SimpleRegression instances in
275 * streaming mode where the regression is applied to a sliding "window" of
276 * observations, however the caller is responsible for maintaining the set
277 * of observations in the window.</p>
278 * <p>
279 * To remove all data, use <code>clear()</code>.</p>
280 *
281 * @param data array of observations to be removed
282 */
283 public void removeData(double[][] data) {
284 for (int i = 0; i < data.length && n > 0; i++) {
285 removeData(data[i][0], data[i][1]);
286 }
287 }
288
289 /**
290 * Clears all data from the model.
291 */
292 public void clear() {
293 sumX = 0d;
294 sumXX = 0d;
295 sumY = 0d;
296 sumYY = 0d;
297 sumXY = 0d;
298 n = 0;
299 }
300
301 /**
302 * Returns the number of observations that have been added to the model.
303 *
304 * @return n number of observations that have been added.
305 */
306 public long getN() {
307 return n;
308 }
309
310 /**
311 * Returns the "predicted" <code>y</code> value associated with the
312 * supplied <code>x</code> value, based on the data that has been
313 * added to the model when this method is activated.
314 * <p>
315 * <code> predict(x) = intercept + slope * x </code></p>
316 * <p>
317 * <strong>Preconditions</strong>: <ul>
318 * <li>At least two observations (with at least two different x values)
319 * must have been added before invoking this method. If this method is
320 * invoked before a model can be estimated, <code>Double,NaN</code> is
321 * returned.
322 * </li></ul></p>
323 *
324 * @param x input <code>x</code> value
325 * @return predicted <code>y</code> value
326 */
327 public double predict(final double x) {
328 final double b1 = getSlope();
329 if (hasIntercept) {
330 return getIntercept(b1) + b1 * x;
331 }
332 return b1 * x;
333 }
334
335 /**
336 * Returns the intercept of the estimated regression line.
337 * <p>
338 * The least squares estimate of the intercept is computed using the
339 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
340 * The intercept is sometimes denoted b0.</p>
341 * <p>
342 * <strong>Preconditions</strong>: <ul>
343 * <li>At least two observations (with at least two different x values)
344 * must have been added before invoking this method. If this method is
345 * invoked before a model can be estimated, <code>Double,NaN</code> is
346 * returned.
347 * </li></ul></p>
348 *
349 * @return the intercept of the regression line
350 */
351 public double getIntercept() {
352 return hasIntercept ? getIntercept(getSlope()) : 0.0;
353 }
354
355 /**
356 * Returns true if a constant has been included false otherwise.
357 *
358 * @return true if constant exists, false otherwise
359 */
360 public boolean hasIntercept() {
361 return hasIntercept;
362 }
363
364 /**
365 * Returns the slope of the estimated regression line.
366 * <p>
367 * The least squares estimate of the slope is computed using the
368 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
369 * The slope is sometimes denoted b1.</p>
370 * <p>
371 * <strong>Preconditions</strong>: <ul>
372 * <li>At least two observations (with at least two different x values)
373 * must have been added before invoking this method. If this method is
374 * invoked before a model can be estimated, <code>Double.NaN</code> is
375 * returned.
376 * </li></ul></p>
377 *
378 * @return the slope of the regression line
379 */
380 public double getSlope() {
381 if (n < 2) {
382 return Double.NaN; //not enough data
383 }
384 if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
385 return Double.NaN; //not enough variation in x
386 }
387 return sumXY / sumXX;
388 }
389
390 /**
391 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
392 * sum of squared errors</a> (SSE) associated with the regression
393 * model.
394 * <p>
395 * The sum is computed using the computational formula</p>
396 * <p>
397 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
398 * <p>
399 * where <code>SYY</code> is the sum of the squared deviations of the y
400 * values about their mean, <code>SXX</code> is similarly defined and
401 * <code>SXY</code> is the sum of the products of x and y mean deviations.
402 * </p><p>
403 * The sums are accumulated using the updating algorithm referenced in
404 * {@link #addData}.</p>
405 * <p>
406 * The return value is constrained to be non-negative - i.e., if due to
407 * rounding errors the computational formula returns a negative result,
408 * 0 is returned.</p>
409 * <p>
410 * <strong>Preconditions</strong>: <ul>
411 * <li>At least two observations (with at least two different x values)
412 * must have been added before invoking this method. If this method is
413 * invoked before a model can be estimated, <code>Double,NaN</code> is
414 * returned.
415 * </li></ul></p>
416 *
417 * @return sum of squared errors associated with the regression model
418 */
419 public double getSumSquaredErrors() {
420 return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
421 }
422
423 /**
424 * Returns the sum of squared deviations of the y values about their mean.
425 * <p>
426 * This is defined as SSTO
427 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
428 * <p>
429 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
430 *
431 * @return sum of squared deviations of y values
432 */
433 public double getTotalSumSquares() {
434 if (n < 2) {
435 return Double.NaN;
436 }
437 return sumYY;
438 }
439
440 /**
441 * Returns the sum of squared deviations of the x values about their mean.
442 *
443 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
444 *
445 * @return sum of squared deviations of x values
446 */
447 public double getXSumSquares() {
448 if (n < 2) {
449 return Double.NaN;
450 }
451 return sumXX;
452 }
453
454 /**
455 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
456 *
457 * @return sum of cross products
458 */
459 public double getSumOfCrossProducts() {
460 return sumXY;
461 }
462
463 /**
464 * Returns the sum of squared deviations of the predicted y values about
465 * their mean (which equals the mean of y).
466 * <p>
467 * This is usually abbreviated SSR or SSM. It is defined as SSM
468 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
469 * <p>
470 * <strong>Preconditions</strong>: <ul>
471 * <li>At least two observations (with at least two different x values)
472 * must have been added before invoking this method. If this method is
473 * invoked before a model can be estimated, <code>Double.NaN</code> is
474 * returned.
475 * </li></ul></p>
476 *
477 * @return sum of squared deviations of predicted y values
478 */
479 public double getRegressionSumSquares() {
480 return getRegressionSumSquares(getSlope());
481 }
482
483 /**
484 * Returns the sum of squared errors divided by the degrees of freedom,
485 * usually abbreviated MSE.
486 * <p>
487 * If there are fewer than <strong>three</strong> data pairs in the model,
488 * or if there is no variation in <code>x</code>, this returns
489 * <code>Double.NaN</code>.</p>
490 *
491 * @return sum of squared deviations of y values
492 */
493 public double getMeanSquareError() {
494 if (n < 3) {
495 return Double.NaN;
496 }
497 return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
498 }
499
500 /**
501 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
502 * Pearson's product moment correlation coefficient</a>,
503 * usually denoted r.
504 * <p>
505 * <strong>Preconditions</strong>: <ul>
506 * <li>At least two observations (with at least two different x values)
507 * must have been added before invoking this method. If this method is
508 * invoked before a model can be estimated, <code>Double,NaN</code> is
509 * returned.
510 * </li></ul></p>
511 *
512 * @return Pearson's r
513 */
514 public double getR() {
515 double b1 = getSlope();
516 double result = FastMath.sqrt(getRSquare());
517 if (b1 < 0) {
518 result = -result;
519 }
520 return result;
521 }
522
523 /**
524 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
525 * coefficient of determination</a>,
526 * usually denoted r-square.
527 * <p>
528 * <strong>Preconditions</strong>: <ul>
529 * <li>At least two observations (with at least two different x values)
530 * must have been added before invoking this method. If this method is
531 * invoked before a model can be estimated, <code>Double,NaN</code> is
532 * returned.
533 * </li></ul></p>
534 *
535 * @return r-square
536 */
537 public double getRSquare() {
538 double ssto = getTotalSumSquares();
539 return (ssto - getSumSquaredErrors()) / ssto;
540 }
541
542 /**
543 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
544 * standard error of the intercept estimate</a>,
545 * usually denoted s(b0).
546 * <p>
547 * If there are fewer that <strong>three</strong> observations in the
548 * model, or if there is no variation in x, this returns
549 * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
550 * returned when the intercept is constrained to be zero
551 *
552 * @return standard error associated with intercept estimate
553 */
554 public double getInterceptStdErr() {
555 if( !hasIntercept ){
556 return Double.NaN;
557 }
558 return FastMath.sqrt(
559 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
560 }
561
562 /**
563 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
564 * error of the slope estimate</a>,
565 * usually denoted s(b1).
566 * <p>
567 * If there are fewer that <strong>three</strong> data pairs in the model,
568 * or if there is no variation in x, this returns <code>Double.NaN</code>.
569 * </p>
570 *
571 * @return standard error associated with slope estimate
572 */
573 public double getSlopeStdErr() {
574 return FastMath.sqrt(getMeanSquareError() / sumXX);
575 }
576
577 /**
578 * Returns the half-width of a 95% confidence interval for the slope
579 * estimate.
580 * <p>
581 * The 95% confidence interval is</p>
582 * <p>
583 * <code>(getSlope() - getSlopeConfidenceInterval(),
584 * getSlope() + getSlopeConfidenceInterval())</code></p>
585 * <p>
586 * If there are fewer that <strong>three</strong> observations in the
587 * model, or if there is no variation in x, this returns
588 * <code>Double.NaN</code>.</p>
589 * <p>
590 * <strong>Usage Note</strong>:<br>
591 * The validity of this statistic depends on the assumption that the
592 * observations included in the model are drawn from a
593 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
594 * Bivariate Normal Distribution</a>.</p>
595 *
596 * @return half-width of 95% confidence interval for the slope estimate
597 * @throws MathException if the confidence interval can not be computed.
598 */
599 public double getSlopeConfidenceInterval() throws MathException {
600 return getSlopeConfidenceInterval(0.05d);
601 }
602
603 /**
604 * Returns the half-width of a (100-100*alpha)% confidence interval for
605 * the slope estimate.
606 * <p>
607 * The (100-100*alpha)% confidence interval is </p>
608 * <p>
609 * <code>(getSlope() - getSlopeConfidenceInterval(),
610 * getSlope() + getSlopeConfidenceInterval())</code></p>
611 * <p>
612 * To request, for example, a 99% confidence interval, use
613 * <code>alpha = .01</code></p>
614 * <p>
615 * <strong>Usage Note</strong>:<br>
616 * The validity of this statistic depends on the assumption that the
617 * observations included in the model are drawn from a
618 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
619 * Bivariate Normal Distribution</a>.</p>
620 * <p>
621 * <strong> Preconditions:</strong><ul>
622 * <li>If there are fewer that <strong>three</strong> observations in the
623 * model, or if there is no variation in x, this returns
624 * <code>Double.NaN</code>.
625 * </li>
626 * <li><code>(0 < alpha < 1)</code>; otherwise an
627 * <code>IllegalArgumentException</code> is thrown.
628 * </li></ul></p>
629 *
630 * @param alpha the desired significance level
631 * @return half-width of 95% confidence interval for the slope estimate
632 * @throws MathException if the confidence interval can not be computed.
633 */
634 public double getSlopeConfidenceInterval(final double alpha)
635 throws MathException {
636 if (alpha >= 1 || alpha <= 0) {
637 throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
638 alpha, 0, 1);
639 }
640 TDistribution distribution = new TDistributionImpl(n - 2);
641 return getSlopeStdErr() *
642 distribution.inverseCumulativeProbability(1d - alpha / 2d);
643 }
644
645 /**
646 * Returns the significance level of the slope (equiv) correlation.
647 * <p>
648 * Specifically, the returned value is the smallest <code>alpha</code>
649 * such that the slope confidence interval with significance level
650 * equal to <code>alpha</code> does not include <code>0</code>.
651 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
652 * </p><p>
653 * <strong>Usage Note</strong>:<br>
654 * The validity of this statistic depends on the assumption that the
655 * observations included in the model are drawn from a
656 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
657 * Bivariate Normal Distribution</a>.</p>
658 * <p>
659 * If there are fewer that <strong>three</strong> observations in the
660 * model, or if there is no variation in x, this returns
661 * <code>Double.NaN</code>.</p>
662 *
663 * @return significance level for slope/correlation
664 * @throws MathException if the significance level can not be computed.
665 */
666 public double getSignificance() throws MathException {
667 TDistribution distribution = new TDistributionImpl(n - 2);
668 return 2d * (1.0 - distribution.cumulativeProbability(
669 FastMath.abs(getSlope()) / getSlopeStdErr()));
670 }
671
672 // ---------------------Private methods-----------------------------------
673
674 /**
675 * Returns the intercept of the estimated regression line, given the slope.
676 * <p>
677 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
678 *
679 * @param slope current slope
680 * @return the intercept of the regression line
681 */
682 private double getIntercept(final double slope) {
683 if( hasIntercept){
684 return (sumY - slope * sumX) / n;
685 }
686 return 0.0;
687 }
688
689 /**
690 * Computes SSR from b1.
691 *
692 * @param slope regression slope estimate
693 * @return sum of squared deviations of predicted y values
694 */
695 private double getRegressionSumSquares(final double slope) {
696 return slope * slope * sumXX;
697 }
698
699 /**
700 * Performs a regression on data present in buffers and outputs a RegressionResults object
701 * @return RegressionResults acts as a container of regression output
702 * @throws ModelSpecificationException if the model is not correctly specified
703 */
704 public RegressionResults regress() throws ModelSpecificationException{
705 if( hasIntercept ){
706 if( n < 3 ){
707 throw new NoDataException( LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION );
708 }
709 if( FastMath.abs( sumXX ) > MathUtils.SAFE_MIN ){
710 final double[] params = new double[]{ getIntercept(), getSlope() };
711 final double mse = getMeanSquareError();
712 final double _syy = sumYY + sumY * sumY / ((double) n);
713 final double[] vcv = new double[]{
714 mse * (xbar *xbar /sumXX + 1.0 / ((double) n)),
715 -xbar*mse/sumXX,
716 mse/sumXX };
717 return new RegressionResults(
718 params, new double[][]{vcv}, true, n, 2,
719 sumY, _syy, getSumSquaredErrors(),true,false);
720 }else{
721 final double[] params = new double[]{ sumY/((double) n), Double.NaN };
722 //final double mse = getMeanSquareError();
723 final double[] vcv = new double[]{
724 ybar / ((double) n - 1.0),
725 Double.NaN,
726 Double.NaN };
727 return new RegressionResults(
728 params, new double[][]{vcv}, true, n, 1,
729 sumY, sumYY, getSumSquaredErrors(),true,false);
730 }
731 }else{
732 if( n < 2 ){
733 throw new NoDataException( LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION );
734 }
735 if( !Double.isNaN(sumXX) ){
736 final double[] vcv = new double[]{ getMeanSquareError() / sumXX };
737 final double[] params = new double[]{ sumXY/sumXX };
738 return new RegressionResults(
739 params, new double[][]{vcv}, true, n, 1,
740 sumY, sumYY, getSumSquaredErrors(),false,false);
741 }else{
742 final double[] vcv = new double[]{Double.NaN };
743 final double[] params = new double[]{ Double.NaN };
744 return new RegressionResults(
745 params, new double[][]{vcv}, true, n, 1,
746 Double.NaN, Double.NaN, Double.NaN,false,false);
747 }
748 }
749 }
750
751 /**
752 * Performs a regression on data present in buffers including only regressors
753 * indexed in variablesToInclude and outputs a RegressionResults object
754 * @param variablesToInclude an array of indices of regressors to include
755 * @return RegressionResults acts as a container of regression output
756 * @throws ModelSpecificationException if the model is not correctly specified
757 * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
758 * @throws OutOfRangeException if a requested variable is not present in model
759 */
760 public RegressionResults regress(int[] variablesToInclude) throws ModelSpecificationException{
761 if( variablesToInclude == null || variablesToInclude.length == 0){
762 throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOTALLOWED);
763 }
764 if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
765 throw new ModelSpecificationException(
766 LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
767 (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
768 }
769
770 if( hasIntercept ){
771 if( variablesToInclude.length == 2 ){
772 if( variablesToInclude[0] == 1 ){
773 throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
774 }else if( variablesToInclude[0] != 0 ){
775 throw new OutOfRangeException( variablesToInclude[0], 0,1 );
776 }
777 if( variablesToInclude[1] != 1){
778 throw new OutOfRangeException( variablesToInclude[0], 0,1 );
779 }
780 return regress();
781 }else{
782 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
783 throw new OutOfRangeException( variablesToInclude[0],0,1 );
784 }
785 final double _mean = sumY * sumY / ((double) n);
786 final double _syy = sumYY + _mean;
787 if( variablesToInclude[0] == 0 ){
788 //just the mean
789 final double[] vcv = new double[]{ sumYY/((double)((n-1)*n)) };
790 final double[] params = new double[]{ ybar };
791 return new RegressionResults(
792 params, new double[][]{vcv}, true, n, 1,
793 sumY, _syy+_mean, sumYY,true,false);
794
795 }else if( variablesToInclude[0] == 1){
796 //final double _syy = sumYY + sumY * sumY / ((double) n);
797 final double _sxx = sumXX + sumX * sumX / ((double) n);
798 final double _sxy = sumXY + sumX * sumY / ((double) n);
799 final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
800 final double _mse = _sse/((double)(n-1));
801 if( !Double.isNaN(_sxx) ){
802 final double[] vcv = new double[]{ _mse / _sxx };
803 final double[] params = new double[]{ _sxy/_sxx };
804 return new RegressionResults(
805 params, new double[][]{vcv}, true, n, 1,
806 sumY, _syy, _sse,false,false);
807 }else{
808 final double[] vcv = new double[]{Double.NaN };
809 final double[] params = new double[]{ Double.NaN };
810 return new RegressionResults(
811 params, new double[][]{vcv}, true, n, 1,
812 Double.NaN, Double.NaN, Double.NaN,false,false);
813 }
814 }
815 }
816 }else{
817 if( variablesToInclude[0] != 0 ){
818 throw new OutOfRangeException(variablesToInclude[0],0,0);
819 }
820 return regress();
821 }
822
823 return null;
824 }
825 }