/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.regression;

import java.io.Serializable;

import org.apache.commons.math3.distribution.TDistribution;
import org.apache.commons.math3.exception.MathIllegalArgumentException;
import org.apache.commons.math3.exception.NoDataException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same)
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul></p>
 *
 */
public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** sum of x values */
    private double sumX = 0d;

    /** total variation in x (sum of squared deviations from xbar) */
    private double sumXX = 0d;

    /** sum of y values */
    private double sumY = 0d;

    /** total variation in y (sum of squared deviations from ybar) */
    private double sumYY = 0d;

    /** sum of products */
    private double sumXY = 0d;

    /** number of observations */
    private long n = 0;

    /** mean of accumulated x values, used in updating formulas */
    private double xbar = 0;

    /** mean of accumulated y values, used in updating formulas */
    private double ybar = 0;

    /** include an intercept or not */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }
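    /*
     * Illustrative usage sketch (not part of this class; the data values are made up).
     * It shows the basic add-then-query workflow described in the class javadoc:
     *
     *   SimpleRegression regression = new SimpleRegression();        // intercept estimated
     *   regression.addData(1d, 2d);
     *   regression.addData(3d, 3d);
     *   regression.addData(4d, 6d);
     *   double slope = regression.getSlope();
     *   double intercept = regression.getIntercept();
     *   double yHat = regression.predict(5d);                        // intercept + slope * 5
     *
     *   // Suppressing the intercept forces the fitted line through the origin:
     *   SimpleRegression noIntercept = new SimpleRegression(false);  // getIntercept() returns 0
     */
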
    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

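    /*
     * Illustrative sketch (hypothetical values): because each getter recomputes its
     * statistic from the running sums maintained by addData, statistics can be read
     * at any point and read again after more data arrives, without a new instance:
     *
     *   SimpleRegression regression = new SimpleRegression();
     *   regression.addData(1d, 1.1d);
     *   regression.addData(2d, 1.9d);
     *   double earlySlope = regression.getSlope();   // based on the first 2 observations
     *   regression.addData(3d, 3.2d);
     *   double laterSlope = regression.getSlope();   // recomputed over all 3 observations
     */
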
    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * Pébay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations, however the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no points of data (i.e. n=0)
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

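    /*
     * Illustrative sketch (hypothetical partitioning): append() lets two models built
     * over disjoint portions of a data set be merged into a single model, e.g. when
     * the portions are accumulated independently:
     *
     *   SimpleRegression first = new SimpleRegression();
     *   first.addData(new double[][] { {1d, 2d}, {2d, 3d} });
     *
     *   SimpleRegression second = new SimpleRegression();
     *   second.addData(new double[][] { {3d, 5d}, {4d, 6d} });
     *
     *   first.append(second);          // first now reflects all four observations
     *   double slope = first.getSlope();
     */
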
    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    public void addObservation(final double[] x, final double y)
        throws ModelSpecificationException {
        if (x == null || x.length == 0) {
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable
     * The length of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if ((x == null) || (y == null) || (x.length != y.length)) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
            }
        }
        if (!obsOk) {
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations, however the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    public long getN() {
        return n;
    }

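    /*
     * Illustrative sketch (hypothetical stream): removeData mirrors addData, so the
     * caller can maintain a fixed-size sliding window as the javadoc above describes.
     * Deque/ArrayDeque and WINDOW_SIZE are illustrative choices, not part of this class:
     *
     *   SimpleRegression regression = new SimpleRegression();
     *   Deque<double[]> window = new ArrayDeque<double[]>();   // caller-managed window
     *   // for each incoming observation (x, y):
     *   //   window.addLast(new double[] {x, y});
     *   //   regression.addData(x, y);
     *   //   if (window.size() > WINDOW_SIZE) {
     *   //       double[] old = window.removeFirst();
     *   //       regression.removeData(old[0], old[1]);
     *   //   }
     */
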
    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

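    /*
     * Illustrative sketch of the estimates the getters above return, expressed in
     * terms of the running sums this class maintains (notation only, no extra API):
     *
     *   slope      = sumXY / sumXX                      // Sxy / Sxx, see getSlope()
     *   intercept  = (sumY - slope * sumX) / n          // ybar - slope * xbar, see getIntercept(slope)
     *   predict(x) = intercept + slope * x
     *
     * Until two observations with distinct x values have been added, both estimates
     * are Double.NaN, so predict(x) is also NaN.
     */
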
    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     * <p>
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return sum of squared errors divided by degrees of freedom (MSE)
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = FastMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

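    /*
     * Illustrative sketch of how the ANOVA-style getters above relate to one another
     * (identities implied by the formulas in their javadoc, not additional API):
     *
     *   SSE  = getSumSquaredErrors()        // SYY - SXY^2 / SXX
     *   SSTO = getTotalSumSquares()         // squared deviations of y about ybar
     *   SSR  = getRegressionSumSquares()    // slope^2 * SXX
     *
     *   // With an intercept in the model and apart from rounding, SSTO = SSR + SSE,
     *   // getRSquare() = (SSTO - SSE) / SSTO, and getR() carries the sign of the slope.
     */
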
    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul></p>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.  Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return FastMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return FastMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

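    /*
     * Illustrative sketch (hypothetical fitted model named "regression"): reporting
     * the slope together with the 95% interval produced by the method above:
     *
     *   double slope = regression.getSlope();
     *   double halfWidth = regression.getSlopeConfidenceInterval();   // alpha = 0.05
     *   double lower = slope - halfWidth;
     *   double upper = slope + halfWidth;
     *   // Returns NaN until the model has at least three observations with varying x.
     */
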
    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is </p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong> Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li><code>(0 &lt; alpha &lt; 1)</code>; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul></p>
     *
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
        throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equivalently, the correlation).
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math3.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return 2d * (1.0 - distribution.cumulativeProbability(
                    FastMath.abs(getSlope()) / getSlopeStdErr()));
    }

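    /*
     * Illustrative sketch (hypothetical threshold): getSignificance() is the two-sided
     * p-value of the t statistic slope / s(b1) on n - 2 degrees of freedom, so a
     * conventional test of "slope differs from 0" can be written as:
     *
     *   double pValue = regression.getSignificance();
     *   boolean slopeIsSignificant = pValue < 0.05;   // 0.05 is an arbitrary example level
     */
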
    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double _syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

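    /*
     * Illustrative sketch: regress() packages the current estimates into a
     * RegressionResults container. The accessor names below (getParameterEstimate,
     * getRSquared) are assumed from that class; when an intercept is estimated,
     * parameter 0 is the intercept and parameter 1 is the slope:
     *
     *   RegressionResults results = regression.regress();
     *   double intercept = results.getParameterEstimate(0);
     *   double slope = results.getParameterEstimate(1);
     *   double rSquared = results.getRSquared();
     */
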
    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new OutOfRangeException(variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                final double _mean = sumY * sumY / n;
                final double _syy = sumYY + _mean;
                if (variablesToInclude[0] == 0) {
                    //just the mean
                    final double[] vcv = new double[] { sumYY / ((n - 1) * n) };
                    final double[] params = new double[] { ybar };
                    return new RegressionResults(
                            params, new double[][] { vcv }, true, n, 1,
                            sumY, _syy + _mean, sumYY, true, false);

                } else if (variablesToInclude[0] == 1) {
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double _sxx = sumXX + sumX * sumX / n;
                    final double _sxy = sumXY + sumX * sumY / n;
                    final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
                    final double _mse = _sse / (n - 1);
                    if (!Double.isNaN(_sxx)) {
                        final double[] vcv = new double[] { _mse / _sxx };
                        final double[] params = new double[] { _sxy / _sxx };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                sumY, _syy, _sse, false, false);
                    } else {
                        final double[] vcv = new double[] { Double.NaN };
                        final double[] params = new double[] { Double.NaN };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}