001: /*
002: * Copyright 2003-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.commons.math.stat.regression;
018:
019: import java.io.Serializable;
020:
021: import org.apache.commons.math.MathException;
022: import org.apache.commons.math.distribution.DistributionFactory;
023: import org.apache.commons.math.distribution.TDistribution;
024:
025: /**
026: * Estimates an ordinary least squares regression model
027: * with one independent variable.
028: * <p>
029: * <code> y = intercept + slope * x </code>
030: * <p>
031: * Standard errors for <code>intercept</code> and <code>slope</code> are
032: * available as well as ANOVA, r-square and Pearson's r statistics.
033: * <p>
034: * Observations (x,y pairs) can be added to the model one at a time or they
035: * can be provided in a 2-dimensional array. The observations are not stored
036: * in memory, so there is no limit to the number of observations that can be
037: * added to the model.
038: * <p>
039: * <strong>Usage Notes</strong>: <ul>
040: * <li> When there are fewer than two observations in the model, or when
041: * there is no variation in the x values (i.e. all x values are the same)
042: * all statistics return <code>NaN</code>. At least two observations with
043: * different x coordinates are requred to estimate a bivariate regression
044: * model.
045: * </li>
046: * <li> getters for the statistics always compute values based on the current
047: * set of observations -- i.e., you can get statistics, then add more data
048: * and get updated statistics without using a new instance. There is no
049: * "compute" method that updates all statistics. Each of the getters performs
050: * the necessary computations to return the requested statistic.</li>
051: * </ul>
052: *
053: * @version $Revision: 348519 $ $Date: 2005-11-23 12:12:18 -0700 (Wed, 23 Nov 2005) $
054: */
055: public class SimpleRegression implements Serializable {
056:
057: /** Serializable version identifier */
058: private static final long serialVersionUID = -3004689053607543335L;
059:
060: /** sum of x values */
061: private double sumX = 0d;
062:
063: /** total variation in x (sum of squared deviations from xbar) */
064: private double sumXX = 0d;
065:
066: /** sum of y values */
067: private double sumY = 0d;
068:
069: /** total variation in y (sum of squared deviations from ybar) */
070: private double sumYY = 0d;
071:
072: /** sum of products */
073: private double sumXY = 0d;
074:
075: /** number of observations */
076: private long n = 0;
077:
078: /** mean of accumulated x values, used in updating formulas */
079: private double xbar = 0;
080:
081: /** mean of accumulated y values, used in updating formulas */
082: private double ybar = 0;
083:
084: // ---------------------Public methods--------------------------------------
085:
086: /**
087: * Create an empty SimpleRegression instance
088: */
089: public SimpleRegression() {
090: super ();
091: }
092:
093: /**
094: * Adds the observation (x,y) to the regression data set.
095: * <p>
096: * Uses updating formulas for means and sums of squares defined in
097: * "Algorithms for Computing the Sample Variance: Analysis and
098: * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
099: * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
100: * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
101: *
102: *
103: * @param x independent variable value
104: * @param y dependent variable value
105: */
106: public void addData(double x, double y) {
107: if (n == 0) {
108: xbar = x;
109: ybar = y;
110: } else {
111: double dx = x - xbar;
112: double dy = y - ybar;
113: sumXX += dx * dx * (double) n / (double) (n + 1.0);
114: sumYY += dy * dy * (double) n / (double) (n + 1.0);
115: sumXY += dx * dy * (double) n / (double) (n + 1.0);
116: xbar += dx / (double) (n + 1.0);
117: ybar += dy / (double) (n + 1.0);
118: }
119: sumX += x;
120: sumY += y;
121: n++;
122: }
123:
124: /**
125: * Adds the observations represented by the elements in
126: * <code>data</code>.
127: * <p>
128: * <code>(data[0][0],data[0][1])</code> will be the first observation, then
129: * <code>(data[1][0],data[1][1])</code>, etc.
130: * <p>
131: * This method does not replace data that has already been added. The
132: * observations represented by <code>data</code> are added to the existing
133: * dataset.
134: * <p>
135: * To replace all data, use <code>clear()</code> before adding the new
136: * data.
137: *
138: * @param data array of observations to be added
139: */
140: public void addData(double[][] data) {
141: for (int i = 0; i < data.length; i++) {
142: addData(data[i][0], data[i][1]);
143: }
144: }
145:
146: /**
147: * Clears all data from the model.
148: */
149: public void clear() {
150: sumX = 0d;
151: sumXX = 0d;
152: sumY = 0d;
153: sumYY = 0d;
154: sumXY = 0d;
155: n = 0;
156: }
157:
158: /**
159: * Returns the number of observations that have been added to the model.
160: *
161: * @return n number of observations that have been added.
162: */
163: public long getN() {
164: return n;
165: }
166:
167: /**
168: * Returns the "predicted" <code>y</code> value associated with the
169: * supplied <code>x</code> value, based on the data that has been
170: * added to the model when this method is activated.
171: * <p>
172: * <code> predict(x) = intercept + slope * x </code>
173: * <p>
174: * <strong>Preconditions</strong>: <ul>
175: * <li>At least two observations (with at least two different x values)
176: * must have been added before invoking this method. If this method is
177: * invoked before a model can be estimated, <code>Double,NaN</code> is
178: * returned.
179: * </li></ul>
180: *
181: * @param x input <code>x</code> value
182: * @return predicted <code>y</code> value
183: */
184: public double predict(double x) {
185: double b1 = getSlope();
186: return getIntercept(b1) + b1 * x;
187: }
188:
189: /**
190: * Returns the intercept of the estimated regression line.
191: * <p>
192: * The least squares estimate of the intercept is computed using the
193: * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
194: * The intercept is sometimes denoted b0.
195: * <p>
196: * <strong>Preconditions</strong>: <ul>
197: * <li>At least two observations (with at least two different x values)
198: * must have been added before invoking this method. If this method is
199: * invoked before a model can be estimated, <code>Double,NaN</code> is
200: * returned.
201: * </li></ul>
202: *
203: * @return the intercept of the regression line
204: */
205: public double getIntercept() {
206: return getIntercept(getSlope());
207: }
208:
209: /**
210: * Returns the slope of the estimated regression line.
211: * <p>
212: * The least squares estimate of the slope is computed using the
213: * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
214: * The slope is sometimes denoted b1.
215: * <p>
216: * <strong>Preconditions</strong>: <ul>
217: * <li>At least two observations (with at least two different x values)
218: * must have been added before invoking this method. If this method is
219: * invoked before a model can be estimated, <code>Double.NaN</code> is
220: * returned.
221: * </li></ul>
222: *
223: * @return the slope of the regression line
224: */
225: public double getSlope() {
226: if (n < 2) {
227: return Double.NaN; //not enough data
228: }
229: if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
230: return Double.NaN; //not enough variation in x
231: }
232: return sumXY / sumXX;
233: }
234:
235: /**
236: * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
237: * sum of squared errors</a> (SSE) associated with the regression
238: * model.
239: * <p>
240: * <strong>Preconditions</strong>: <ul>
241: * <li>At least two observations (with at least two different x values)
242: * must have been added before invoking this method. If this method is
243: * invoked before a model can be estimated, <code>Double,NaN</code> is
244: * returned.
245: * </li></ul>
246: *
247: * @return sum of squared errors associated with the regression model
248: */
249: public double getSumSquaredErrors() {
250: return getSumSquaredErrors(getSlope());
251: }
252:
253: /**
254: * Returns the sum of squared deviations of the y values about their mean.
255: * <p>
256: * This is defined as SSTO
257: * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
258: * <p>
259: * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
260: *
261: * @return sum of squared deviations of y values
262: */
263: public double getTotalSumSquares() {
264: if (n < 2) {
265: return Double.NaN;
266: }
267: return sumYY;
268: }
269:
270: /**
271: * Returns the sum of squared deviations of the predicted y values about
272: * their mean (which equals the mean of y).
273: * <p>
274: * This is usually abbreviated SSR or SSM. It is defined as SSM
275: * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
276: * <p>
277: * <strong>Preconditions</strong>: <ul>
278: * <li>At least two observations (with at least two different x values)
279: * must have been added before invoking this method. If this method is
280: * invoked before a model can be estimated, <code>Double.NaN</code> is
281: * returned.
282: * </li></ul>
283: *
284: * @return sum of squared deviations of predicted y values
285: */
286: public double getRegressionSumSquares() {
287: return getRegressionSumSquares(getSlope());
288: }
289:
290: /**
291: * Returns the sum of squared errors divided by the degrees of freedom,
292: * usually abbreviated MSE.
293: * <p>
294: * If there are fewer than <strong>three</strong> data pairs in the model,
295: * or if there is no variation in <code>x</code>, this returns
296: * <code>Double.NaN</code>.
297: *
298: * @return sum of squared deviations of y values
299: */
300: public double getMeanSquareError() {
301: if (n < 3) {
302: return Double.NaN;
303: }
304: return getSumSquaredErrors() / (double) (n - 2);
305: }
306:
307: /**
308: * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
309: * Pearson's product moment correlation coefficient</a>,
310: * usually denoted r.
311: * <p>
312: * <strong>Preconditions</strong>: <ul>
313: * <li>At least two observations (with at least two different x values)
314: * must have been added before invoking this method. If this method is
315: * invoked before a model can be estimated, <code>Double,NaN</code> is
316: * returned.
317: * </li></ul>
318: *
319: * @return Pearson's r
320: */
321: public double getR() {
322: double b1 = getSlope();
323: double result = Math.sqrt(getRSquare(b1));
324: if (b1 < 0) {
325: result = -result;
326: }
327: return result;
328: }
329:
330: /**
331: * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
332: * coefficient of determination</a>,
333: * usually denoted r-square.
334: * <p>
335: * <strong>Preconditions</strong>: <ul>
336: * <li>At least two observations (with at least two different x values)
337: * must have been added before invoking this method. If this method is
338: * invoked before a model can be estimated, <code>Double,NaN</code> is
339: * returned.
340: * </li></ul>
341: *
342: * @return r-square
343: */
344: public double getRSquare() {
345: return getRSquare(getSlope());
346: }
347:
348: /**
349: * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
350: * standard error of the intercept estimate</a>,
351: * usually denoted s(b0).
352: * <p>
353: * If there are fewer that <strong>three</strong> observations in the
354: * model, or if there is no variation in x, this returns
355: * <code>Double.NaN</code>.
356: *
357: * @return standard error associated with intercept estimate
358: */
359: public double getInterceptStdErr() {
360: return Math.sqrt(getMeanSquareError()
361: * ((1d / (double) n) + (xbar * xbar) / sumXX));
362: }
363:
364: /**
365: * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
366: * error of the slope estimate</a>,
367: * usually denoted s(b1).
368: * <p>
369: * If there are fewer that <strong>three</strong> data pairs in the model,
370: * or if there is no variation in x, this returns <code>Double.NaN</code>.
371: *
372: * @return standard error associated with slope estimate
373: */
374: public double getSlopeStdErr() {
375: return Math.sqrt(getMeanSquareError() / sumXX);
376: }
377:
378: /**
379: * Returns the half-width of a 95% confidence interval for the slope
380: * estimate.
381: * <p>
382: * The 95% confidence interval is
383: * <p>
384: * <code>(getSlope() - getSlopeConfidenceInterval(),
385: * getSlope() + getSlopeConfidenceInterval())</code>
386: * <p>
387: * If there are fewer that <strong>three</strong> observations in the
388: * model, or if there is no variation in x, this returns
389: * <code>Double.NaN</code>.
390: * <p>
391: * <strong>Usage Note</strong>:<br>
392: * The validity of this statistic depends on the assumption that the
393: * observations included in the model are drawn from a
394: * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
395: * Bivariate Normal Distribution</a>.
396: *
397: * @return half-width of 95% confidence interval for the slope estimate
398: *
399: * @throws MathException if the confidence interval can not be computed.
400: */
401: public double getSlopeConfidenceInterval() throws MathException {
402: return getSlopeConfidenceInterval(0.05d);
403: }
404:
405: /**
406: * Returns the half-width of a (100-100*alpha)% confidence interval for
407: * the slope estimate.
408: * <p>
409: * The (100-100*alpha)% confidence interval is
410: * <p>
411: * <code>(getSlope() - getSlopeConfidenceInterval(),
412: * getSlope() + getSlopeConfidenceInterval())</code>
413: * <p>
414: * To request, for example, a 99% confidence interval, use
415: * <code>alpha = .01</code>
416: * <p>
417: * <strong>Usage Note</strong>:<br>
418: * The validity of this statistic depends on the assumption that the
419: * observations included in the model are drawn from a
420: * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
421: * Bivariate Normal Distribution</a>.
422: * <p>
423: * <strong> Preconditions:</strong><ul>
424: * <li>If there are fewer that <strong>three</strong> observations in the
425: * model, or if there is no variation in x, this returns
426: * <code>Double.NaN</code>.
427: * </li>
428: * <li><code>(0 < alpha < 1)</code>; otherwise an
429: * <code>IllegalArgumentException</code> is thrown.
430: * </li></ul>
431: *
432: * @param alpha the desired significance level
433: * @return half-width of 95% confidence interval for the slope estimate
434: * @throws MathException if the confidence interval can not be computed.
435: */
436: public double getSlopeConfidenceInterval(double alpha)
437: throws MathException {
438: if (alpha >= 1 || alpha <= 0) {
439: throw new IllegalArgumentException();
440: }
441: return getSlopeStdErr()
442: * getTDistribution().inverseCumulativeProbability(
443: 1d - alpha / 2d);
444: }
445:
446: /**
447: * Returns the significance level of the slope (equiv) correlation.
448: * <p>
449: * Specifically, the returned value is the smallest <code>alpha</code>
450: * such that the slope confidence interval with significance level
451: * equal to <code>alpha</code> does not include <code>0</code>.
452: * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
453: * <p>
454: * <strong>Usage Note</strong>:<br>
455: * The validity of this statistic depends on the assumption that the
456: * observations included in the model are drawn from a
457: * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
458: * Bivariate Normal Distribution</a>.
459: * <p>
460: * If there are fewer that <strong>three</strong> observations in the
461: * model, or if there is no variation in x, this returns
462: * <code>Double.NaN</code>.
463: *
464: * @return significance level for slope/correlation
465: * @throws MathException if the significance level can not be computed.
466: */
467: public double getSignificance() throws MathException {
468: return 2d * (1.0 - getTDistribution().cumulativeProbability(
469: Math.abs(getSlope()) / getSlopeStdErr()));
470: }
471:
472: // ---------------------Private methods-----------------------------------
473:
474: /**
475: * Returns the intercept of the estimated regression line, given the slope.
476: * <p>
477: * Will return <code>NaN</code> if slope is <code>NaN</code>.
478: *
479: * @param slope current slope
480: * @return the intercept of the regression line
481: */
482: private double getIntercept(double slope) {
483: return (sumY - slope * sumX) / ((double) n);
484: }
485:
486: /**
487: * Returns the sum of squared errors associated with the regression
488: * model, using the slope of the regression line.
489: * <p>
490: * Returns NaN if the slope is NaN.
491: *
492: * @param b1 current slope
493: * @return sum of squared errors associated with the regression model
494: */
495: private double getSumSquaredErrors(double b1) {
496: return sumYY - sumXY * sumXY / sumXX;
497: }
498:
499: /**
500: * Computes r-square from the slope.
501: * <p>
502: * will return NaN if slope is Nan.
503: *
504: * @param b1 current slope
505: * @return r-square
506: */
507: private double getRSquare(double b1) {
508: double ssto = getTotalSumSquares();
509: return (ssto - getSumSquaredErrors(b1)) / ssto;
510: }
511:
512: /**
513: * Computes SSR from b1.
514: *
515: * @param slope regression slope estimate
516: * @return sum of squared deviations of predicted y values
517: */
518: private double getRegressionSumSquares(double slope) {
519: return slope * slope * sumXX;
520: }
521:
522: /**
523: * Uses distribution framework to get a t distribution instance
524: * with df = n - 2
525: *
526: * @return t distribution with df = n - 2
527: */
528: private TDistribution getTDistribution() {
529: return DistributionFactory.newInstance().createTDistribution(
530: n - 2);
531: }
532: }
|