/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.stat.regression;

import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
import org.apache.commons.math4.legacy.exception.NoDataException;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.core.jdkmath.JdkMath;
import org.apache.commons.numbers.core.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same)
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
 *
 */
public class SimpleRegression implements UpdatingMultipleLinearRegression {
    /** sum of x values. */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar). */
    private double sumXX;

    /** sum of y values. */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar). */
    private double sumYY;

    /** sum of products.
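     * When the model has an intercept, products of deviations of x and y from
     * their running means are accumulated; otherwise raw cross-products are
     * accumulated.  Together with {@code sumXX} this yields the slope estimate
     * {@code sumXY / sumXX} (see {@link #getSlope()}).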
     */
    private double sumXY;

    /** number of observations. */
    private long n;

    /** mean of accumulated x values, used in updating formulas. */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas. */
    private double ybar;

    /** include an intercept or not. */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
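     *
     * <p>For illustration, a minimal usage sketch (the data values below are
     * hypothetical): partial results accumulated over disjoint chunks of a
     * data set can be merged without revisiting the individual observations.</p>
     * <pre>{@code
     * SimpleRegression first = new SimpleRegression();
     * first.addData(1, 2);
     * first.addData(2, 3);
     *
     * SimpleRegression second = new SimpleRegression();
     * second.addData(3, 5);
     * second.addData(4, 6);
     *
     * first.append(second); // "first" now reflects all four observations
     * double slope = first.getSlope();
     * }</pre>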
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * Pébay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations, however the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no points of data (i.e. n=0).
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
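     * <p>Because this implementation estimates a single-regressor model, only
     * the first element of {@code x} is used; any further elements are
     * ignored.</p>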
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x, final double y)
        throws ModelSpecificationException {
        if (x == null || x.length == 0) {
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION, x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable
     * The length of x and y must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if (x == null || y == null || x.length != y.length) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
                break;
            }
        }
        if (!obsOk) {
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations, however the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is activated.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (JdkMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return JdkMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * If {@code n < 2}, this returns <code>Double.NaN</code>.
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return sum of squared errors divided by the degrees of freedom
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = JdkMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
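     * <p>This is computed as {@code (SSTO - SSE) / SSTO}, where SSTO is
     * {@link #getTotalSumSquares()} and SSE is {@link #getSumSquaredErrors()}.</p>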
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p> Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return JdkMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return JdkMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
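     * <p>The half-width is {@code getSlopeStdErr()} multiplied by the
     * {@code 1 - alpha/2} quantile of a t distribution with {@code n - 2}
     * degrees of freedom.</p>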
     * <p>
     * The (100-100*alpha)% confidence interval is </p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong> Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul>
     *
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
        throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return getSlopeStdErr() *
            distribution.inverseSurvivalProbability(alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted {@code Prob(|t| > 0)}
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return 2d * (distribution.survivalProbability(
                    JdkMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
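     * <p>When the model includes an intercept this is
     * {@code (sumY - slope * sumX) / n}, i.e. {@code ybar - slope * xbar};
     * when the intercept is suppressed, {@code 0} is returned.</p>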
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (JdkMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object.
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new OutOfRangeException(variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                final double mean = sumY * sumY / n;
                final double syy = sumYY + mean;
                if (variablesToInclude[0] == 0) {
                    //just the mean
                    final double[] vcv = new double[] { sumYY / ((n - 1) * n) };
                    final double[] params = new double[] { ybar };
                    return new RegressionResults(
                            params, new double[][] { vcv }, true, n, 1,
                            sumY, syy + mean, sumYY, true, false);
                } else if (variablesToInclude[0] == 1) {
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double sxx = sumXX + sumX * sumX / n;
                    final double sxy = sumXY + sumX * sumY / n;
                    final double sse = JdkMath.max(0d, syy - sxy * sxy / sxx);
                    final double mse = sse / (n - 1);
                    if (!Double.isNaN(sxx)) {
                        final double[] vcv = new double[] { mse / sxx };
                        final double[] params = new double[] { sxy / sxx };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                sumY, syy, sse, false, false);
                    } else {
                        final double[] vcv = new double[] { Double.NaN };
                        final double[] params = new double[] { Double.NaN };
                        return new RegressionResults(
                                params, new double[][] { vcv }, true, n, 1,
                                Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}