SimpleRegression.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math4.legacy.stat.regression;

import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
import org.apache.commons.math4.legacy.exception.NoDataException;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.core.jdkmath.JdkMath;
import org.apache.commons.numbers.core.Precision;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p>
 * <strong>Usage Notes</strong>: <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
 *
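 * <p><strong>Example</strong> (a minimal, illustrative sketch; variable names
 * are arbitrary):</p>
 * <pre>{@code
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 6d);
 *
 * double slope = regression.getSlope();          // estimated slope
 * double intercept = regression.getIntercept();  // estimated intercept
 * double yHat = regression.predict(5d);          // predicted y at x = 5
 * }</pre>
 *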
 */
public class SimpleRegression implements UpdatingMultipleLinearRegression {
    /** sum of x values. */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar). */
    private double sumXX;

    /** sum of y values. */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar). */
    private double sumYY;

    /** sum of products. */
    private double sumXY;

    /** number of observations. */
    private long n;

    /** mean of accumulated x values, used in updating formulas. */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas. */
    private double ybar;

    /** include an intercept or not. */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance.
     */
    public SimpleRegression() {
        this(true);
    }
    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
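     * <p>For example (an illustrative sketch), two regressions built on
     * separate chunks of data can be merged into one:</p>
     * <pre>{@code
     * SimpleRegression first = new SimpleRegression();
     * first.addData(new double[][] { {1, 2}, {2, 3} });
     *
     * SimpleRegression second = new SimpleRegression();
     * second.addData(new double[][] { {3, 5}, {4, 6} });
     *
     * first.append(second); // "first" now reflects all four observations
     * }</pre>
     *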
     * @param reg model to append data from
     * @since 3.3
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller is
     * responsible for maintaining the set of observations in the window.</p>
     *
     * The method has no effect if there are no data points (i.e. n=0).
     *
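     * <p>For example, an illustrative sketch of a sliding window (the caller
     * keeps the window contents in its own buffer; {@code regression},
     * {@code oldestX} and {@code oldestY} are hypothetical names):</p>
     * <pre>{@code
     * // when a new observation (x, y) arrives and the window is full,
     * // drop the oldest observation before adding the newest one
     * regression.removeData(oldestX, oldestY);
     * regression.addData(x, y);
     * }</pre>
     *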
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
     * @param data array of observations to be added
     * @throws ModelSpecificationException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws ModelSpecificationException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws ModelSpecificationException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x, final double y)
            throws ModelSpecificationException {
        if (x == null || x.length == 0) {
            throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
                                                  x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable;
     * the lengths of {@code x} and {@code y} must be the same
     * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x, final double[] y) throws ModelSpecificationException {
        if (x == null || y == null || x.length != y.length) {
            throw new ModelSpecificationException(
                  LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
                  (x == null) ? 0 : x.length,
                  (y == null) ? 0 : y.length);
        }
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
                break;
            }
        }
        if (!obsOk) {
            throw new ModelSpecificationException(
                  LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; //not enough data
        }
        if (JdkMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; //not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return JdkMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * If <code>n &lt; 2</code>, this returns <code>Double.NaN</code>.
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of cross products, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return the mean square error
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = JdkMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p>
     * <strong>Preconditions</strong>: <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p> Additionally, <code>Double.NaN</code> is
     * returned when the intercept is constrained to be zero.
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return JdkMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return JdkMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval cannot be computed.
     */
    public double getSlopeConfidenceInterval() throws OutOfRangeException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code>, as shown below.</p>
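     * <p>For example (an illustrative sketch; {@code regression} is a
     * previously populated instance):</p>
     * <pre>{@code
     * // half-width of a 99% confidence interval for the slope
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01);
     * double lower = regression.getSlope() - halfWidth;
     * double upper = regression.getSlope() + halfWidth;
     * }</pre>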
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong>Preconditions:</strong><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise an
     * <code>OutOfRangeException</code> is thrown.
     * </li></ul>
     *
     * @param alpha the desired significance level
     * @return half-width of (100-100*alpha)% confidence interval for the slope estimate
     * @throws OutOfRangeException if the confidence interval cannot be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
            throws OutOfRangeException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return getSlopeStdErr() *
            distribution.inverseSurvivalProbability(alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted {@code Prob(|t| > 0)}.
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return significance level for slope/correlation
     * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
     * if the significance level cannot be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised NotStrictlyPositiveException here - will return NaN above
        TDistribution distribution = TDistribution.of(n - 2d);
        return 2d * (distribution.survivalProbability(
                    JdkMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code NoDataException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
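     * <p>For example (an illustrative sketch; {@code regression} is a
     * previously populated instance, and the {@code RegressionResults}
     * accessor names shown are assumed):</p>
     * <pre>{@code
     * RegressionResults results = regression.regress();
     * double intercept = results.getParameterEstimate(0);
     * double slope = results.getParameterEstimate(1);
     * }</pre>
     *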
     * @return RegressionResults acts as a container of regression output
     * @throws ModelSpecificationException if the model is not correctly specified
     * @throws NoDataException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws ModelSpecificationException, NoDataException {
        if (hasIntercept) {
            if (n < 3) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (JdkMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = new double[] { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double syy = sumYY + sumY * sumY / n;
                final double[] vcv = new double[] {
                    mse * (xbar * xbar / sumXX + 1.0 / n),
                    -xbar * mse / sumXX,
                    mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2,
                        sumY, syy, getSumSquaredErrors(), true, false);
            } else {
                final double[] params = new double[] { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = new double[] { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1,
                        sumY, sumYY, getSumSquaredErrors(), true, false);
            }
        } else {
            if (n < 2) {
                throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = new double[] { getMeanSquareError() / sumXX };
                final double[] params = new double[] { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1,
                        sumY, sumYY, getSumSquaredErrors(), false, false);
            } else {
                final double[] vcv = new double[] { Double.NaN };
                final double[] params = new double[] { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1,
                        Double.NaN, Double.NaN, Double.NaN, false, false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in {@code variablesToInclude} and outputs a RegressionResults object.
     *
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws OutOfRangeException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new ModelSpecificationException(
                    LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new OutOfRangeException(variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new OutOfRangeException(variablesToInclude[0], 0, 1);
                }
                final double mean = sumY * sumY / n;
                final double syy = sumYY + mean;
                if (variablesToInclude[0] == 0) {
                    // just the mean
                    final double[] vcv = new double[] { sumYY / ((n - 1) * n) };
                    final double[] params = new double[] { ybar };
                    return new RegressionResults(
                        params, new double[][] { vcv }, true, n, 1,
                        sumY, syy + mean, sumYY, true, false);
                } else if (variablesToInclude[0] == 1) {
                    // final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double sxx = sumXX + sumX * sumX / n;
                    final double sxy = sumXY + sumX * sumY / n;
                    final double sse = JdkMath.max(0d, syy - sxy * sxy / sxx);
                    final double mse = sse / (n - 1);
                    if (!Double.isNaN(sxx)) {
                        final double[] vcv = new double[] { mse / sxx };
                        final double[] params = new double[] { sxy / sxx };
                        return new RegressionResults(
                                    params, new double[][] { vcv }, true, n, 1,
                                    sumY, syy, sse, false, false);
                    } else {
                        final double[] vcv = new double[] { Double.NaN };
                        final double[] params = new double[] { Double.NaN };
                        return new RegressionResults(
                                    params, new double[][] { vcv }, true, n, 1,
                                    Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new OutOfRangeException(variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}