001: /*
002: * Copyright 2003-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.commons.math.random;
018:
019: import java.io.Serializable;
020: import java.io.BufferedReader;
021: import java.io.FileReader;
022: import java.io.File;
023: import java.io.IOException;
024: import java.io.InputStreamReader;
025: import java.net.URL;
026: import java.util.ArrayList;
027: import java.util.List;
028:
029: import org.apache.commons.math.stat.descriptive.SummaryStatistics;
030: import org.apache.commons.math.stat.descriptive.StatisticalSummary;
031:
032: /**
033: * Implements <code>EmpiricalDistribution</code> interface. This implementation
034: * uses what amounts to the
035: * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
036: * Variable Kernel Method</a> with Gaussian smoothing:<p>
037: * <strong>Digesting the input file</strong>
038: * <ol><li>Pass the file once to compute min and max.</li>
039: * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
040: * <li>Pass the data file again, computing bin counts and univariate
041: * statistics (mean, std dev.) for each of the bins </li>
042: * <li>Divide the interval (0,1) into subintervals associated with the bins,
043: * with the length of a bin's subinterval proportional to its count.</li></ol>
044: * <strong>Generating random values from the distribution</strong><ol>
045: * <li>Generate a uniformly distributed value in (0,1) </li>
046: * <li>Select the subinterval to which the value belongs.
047: * <li>Generate a random Gaussian value with mean = mean of the associated
048: * bin and std dev = std dev of associated bin.</li></ol></p><p>
049: *<strong>USAGE NOTES:</strong><ul>
050: *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
051: * is to set the bin count to approximately the length of the input file divided
052: * by 10. </li>
053: *<li>The input file <i>must</i> be a plain text file containing one valid numeric
054: * entry per line.</li>
055: * </ul></p>
056: *
057: * @version $Revision: 348894 $ $Date: 2005-11-24 23:34:47 -0700 (Thu, 24 Nov 2005) $
058: */
059: public class EmpiricalDistributionImpl implements Serializable,
060: EmpiricalDistribution {
061:
062: /** Serializable version identifier */
063: private static final long serialVersionUID = -6773236347582113490L;
064:
065: /** List of SummaryStatistics objects characterizing the bins */
066: private ArrayList binStats = null;
067:
068: /** Sample statistics */
069: private SummaryStatistics sampleStats = null;
070:
071: /** number of bins */
072: private int binCount = 1000;
073:
074: /** is the distribution loaded? */
075: private boolean loaded = false;
076:
077: /** upper bounds of subintervals in (0,1) "belonging" to the bins */
078: private double[] upperBounds = null;
079:
080: /** RandomData instance to use in repeated calls to getNext() */
081: private RandomData randomData = new RandomDataImpl();
082:
083: /**
084: * Creates a new EmpiricalDistribution with the default bin count.
085: */
086: public EmpiricalDistributionImpl() {
087: binStats = new ArrayList();
088: }
089:
090: /**
091: * Creates a new EmpiricalDistribution with the specified bin count.
092: *
093: * @param binCount number of bins
094: */
095: public EmpiricalDistributionImpl(int binCount) {
096: this .binCount = binCount;
097: binStats = new ArrayList();
098: }
099:
100: /**
101: * Computes the empirical distribution from the provided
102: * array of numbers.
103: *
104: * @param in the input data array
105: */
106: public void load(double[] in) {
107: DataAdapter da = new ArrayDataAdapter(in);
108: try {
109: da.computeStats();
110: fillBinStats(in);
111: } catch (Exception e) {
112: throw new RuntimeException(e.getMessage());
113: }
114: loaded = true;
115:
116: }
117:
118: /**
119: * Computes the empirical distribution using data read from a URL.
120: * @param url url of the input file
121: *
122: * @throws IOException if an IO error occurs
123: */
124: public void load(URL url) throws IOException {
125: BufferedReader in = new BufferedReader(new InputStreamReader(
126: url.openStream()));
127: try {
128: DataAdapter da = new StreamDataAdapter(in);
129: try {
130: da.computeStats();
131: } catch (Exception e) {
132: throw new IOException(e.getMessage());
133: }
134: in = new BufferedReader(new InputStreamReader(url
135: .openStream()));
136: fillBinStats(in);
137: loaded = true;
138: } finally {
139: if (in != null) {
140: try {
141: in.close();
142: } catch (Exception ex) {
143: // ignore
144: }
145: }
146: }
147: }
148:
149: /**
150: * Computes the empirical distribution from the input file.
151: *
152: * @param file the input file
153: * @throws IOException if an IO error occurs
154: */
155: public void load(File file) throws IOException {
156: BufferedReader in = new BufferedReader(new FileReader(file));
157: try {
158: DataAdapter da = new StreamDataAdapter(in);
159: try {
160: da.computeStats();
161: } catch (Exception e) {
162: throw new IOException(e.getMessage());
163: }
164: in = new BufferedReader(new FileReader(file));
165: fillBinStats(in);
166: loaded = true;
167: } finally {
168: if (in != null) {
169: try {
170: in.close();
171: } catch (Exception ex) {
172: // ignore
173: }
174: }
175: }
176: }
177:
178: /**
179: * Provides methods for computing <code>sampleStats</code> and
180: * <code>beanStats</code> abstracting the source of data.
181: */
182: private abstract class DataAdapter {
183: /**
184: * Compute bin stats.
185: *
186: * @param min minimum value
187: * @param delta grid size
188: * @throws Exception if an error occurs computing bin stats
189: */
190: public abstract void computeBinStats(double min, double delta)
191: throws Exception;
192:
193: /**
194: * Compute sample statistics.
195: *
196: * @throws Exception if an error occurs computing sample stats
197: */
198: public abstract void computeStats() throws Exception;
199: }
200:
201: /**
202: * Factory of <code>DataAdapter</code> objects. For every supported source
203: * of data (array of doubles, file, etc.) an instance of the proper object
204: * is returned.
205: */
206: private class DataAdapterFactory {
207: /**
208: * Creates a DataAdapter from a data object
209: *
210: * @param in object providing access to the data
211: * @return DataAdapter instance
212: */
213: public DataAdapter getAdapter(Object in) {
214: if (in instanceof BufferedReader) {
215: BufferedReader inputStream = (BufferedReader) in;
216: return new StreamDataAdapter(inputStream);
217: } else if (in instanceof double[]) {
218: double[] inputArray = (double[]) in;
219: return new ArrayDataAdapter(inputArray);
220: } else {
221: throw new IllegalArgumentException(
222: "Input data comes from the"
223: + " unsupported source");
224: }
225: }
226: }
227:
228: /**
229: * <code>DataAdapter</code> for data provided through some input stream
230: */
231: private class StreamDataAdapter extends DataAdapter {
232:
233: /** Input stream providng access to the data */
234: private BufferedReader inputStream;
235:
236: /**
237: * Create a StreamDataAdapter from a BufferedReader
238: *
239: * @param in BufferedReader input stream
240: */
241: public StreamDataAdapter(BufferedReader in) {
242: super ();
243: inputStream = in;
244: }
245:
246: /**
247: * Computes binStats
248: *
249: * @param min minimum value
250: * @param delta grid size
251: * @throws IOException if an IO error occurs
252: */
253: public void computeBinStats(double min, double delta)
254: throws IOException {
255: String str = null;
256: double val = 0.0d;
257: while ((str = inputStream.readLine()) != null) {
258: val = Double.parseDouble(str);
259: SummaryStatistics stats = (SummaryStatistics) binStats
260: .get(findBin(min, val, delta));
261: stats.addValue(val);
262: }
263:
264: inputStream.close();
265: inputStream = null;
266: }
267:
268: /**
269: * Computes sampleStats
270: *
271: * @throws IOException if an IOError occurs
272: */
273: public void computeStats() throws IOException {
274: String str = null;
275: double val = 0.0;
276: sampleStats = SummaryStatistics.newInstance();
277: while ((str = inputStream.readLine()) != null) {
278: val = new Double(str).doubleValue();
279: sampleStats.addValue(val);
280: }
281: inputStream.close();
282: inputStream = null;
283: }
284: }
285:
286: /**
287: * <code>DataAdapter</code> for data provided as array of doubles.
288: */
289: private class ArrayDataAdapter extends DataAdapter {
290:
291: /** Array of input data values */
292: private double[] inputArray;
293:
294: /**
295: * Construct an ArrayDataAdapter from a double[] array
296: *
297: * @param in double[] array holding the data
298: */
299: public ArrayDataAdapter(double[] in) {
300: super ();
301: inputArray = in;
302: }
303:
304: /**
305: * Computes sampleStats
306: *
307: * @throws IOException if an IO error occurs
308: */
309: public void computeStats() throws IOException {
310: sampleStats = SummaryStatistics.newInstance();
311: for (int i = 0; i < inputArray.length; i++) {
312: sampleStats.addValue(inputArray[i]);
313: }
314: }
315:
316: /**
317: * Computes binStats
318: *
319: * @param min minimum value
320: * @param delta grid size
321: * @throws IOException if an IO error occurs
322: */
323: public void computeBinStats(double min, double delta)
324: throws IOException {
325: for (int i = 0; i < inputArray.length; i++) {
326: SummaryStatistics stats = (SummaryStatistics) binStats
327: .get(findBin(min, inputArray[i], delta));
328: stats.addValue(inputArray[i]);
329: }
330: }
331: }
332:
333: /**
334: * Fills binStats array (second pass through data file).
335: *
336: * @param in object providing access to the data
337: * @throws IOException if an IO error occurs
338: */
339: private void fillBinStats(Object in) throws IOException {
340: // Load array of bin upper bounds -- evenly spaced from min - max
341: double min = sampleStats.getMin();
342: double max = sampleStats.getMax();
343: double delta = (max - min)
344: / (new Double(binCount)).doubleValue();
345: double[] binUpperBounds = new double[binCount];
346: binUpperBounds[0] = min + delta;
347: for (int i = 1; i < binCount - 1; i++) {
348: binUpperBounds[i] = binUpperBounds[i - 1] + delta;
349: }
350: binUpperBounds[binCount - 1] = max;
351:
352: // Initialize binStats ArrayList
353: if (!binStats.isEmpty()) {
354: binStats.clear();
355: }
356: for (int i = 0; i < binCount; i++) {
357: SummaryStatistics stats = SummaryStatistics.newInstance();
358: binStats.add(i, stats);
359: }
360:
361: // Filling data in binStats Array
362: DataAdapterFactory aFactory = new DataAdapterFactory();
363: DataAdapter da = aFactory.getAdapter(in);
364: try {
365: da.computeBinStats(min, delta);
366: } catch (Exception e) {
367: if (e instanceof RuntimeException) {
368: throw new RuntimeException(e.getMessage());
369: } else {
370: throw new IOException(e.getMessage());
371: }
372: }
373:
374: // Assign upperBounds based on bin counts
375: upperBounds = new double[binCount];
376: upperBounds[0] = ((double) ((SummaryStatistics) binStats.get(0))
377: .getN())
378: / (double) sampleStats.getN();
379: for (int i = 1; i < binCount - 1; i++) {
380: upperBounds[i] = upperBounds[i - 1]
381: + ((double) ((SummaryStatistics) binStats.get(i))
382: .getN()) / (double) sampleStats.getN();
383: }
384: upperBounds[binCount - 1] = 1.0d;
385: }
386:
387: /**
388: * Returns the index of the bin to which the given value belongs
389: *
390: * @param min the minimum value
391: * @param value the value whose bin we are trying to find
392: * @param delta the grid size
393: * @return the index of the bin containing the value
394: */
395: private int findBin(double min, double value, double delta) {
396: return Math.min(Math.max(
397: (int) Math.ceil((value - min) / delta) - 1, 0),
398: binCount - 1);
399: }
400:
401: /**
402: * Generates a random value from this distribution.
403: *
404: * @return the random value.
405: * @throws IllegalStateException if the distribution has not been loaded
406: */
407: public double getNextValue() throws IllegalStateException {
408:
409: if (!loaded) {
410: throw new IllegalStateException("distribution not loaded");
411: }
412:
413: // Start with a uniformly distributed random number in (0,1)
414: double x = Math.random();
415:
416: // Use this to select the bin and generate a Gaussian within the bin
417: for (int i = 0; i < binCount; i++) {
418: if (x <= upperBounds[i]) {
419: SummaryStatistics stats = (SummaryStatistics) binStats
420: .get(i);
421: if (stats.getN() > 0) {
422: if (stats.getStandardDeviation() > 0) { // more than one obs
423: return randomData.nextGaussian(stats.getMean(),
424: stats.getStandardDeviation());
425: } else {
426: return stats.getMean(); // only one obs in bin
427: }
428: }
429: }
430: }
431: throw new RuntimeException("No bin selected");
432: }
433:
434: /**
435: * Returns a {@link StatisticalSummary} describing this distribution.
436: * <strong>Preconditions:</strong><ul>
437: * <li>the distribution must be loaded before invoking this method</li></ul>
438: *
439: * @return the sample statistics
440: * @throws IllegalStateException if the distribution has not been loaded
441: */
442: public StatisticalSummary getSampleStats() {
443: return sampleStats;
444: }
445:
446: /**
447: * Returns the number of bins.
448: *
449: * @return the number of bins.
450: */
451: public int getBinCount() {
452: return binCount;
453: }
454:
455: /**
456: * Returns an ArrayList of {@link SummaryStatistics} instances containing
457: * statistics describing the values in each of the bins. The ArrayList is
458: * indexed on the bin number.
459: *
460: * @return List of bin statistics.
461: */
462: public List getBinStats() {
463: return binStats;
464: }
465:
466: /**
467: * Returns (a fresh copy of) the array of upper bounds for the bins.
468: Bins are: <br/>
469: * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
470: * (upperBounds[binCount-1],max]
471: *
472: * @return array of bin upper bounds
473: */
474: public double[] getUpperBounds() {
475: int len = upperBounds.length;
476: double[] out = new double[len];
477: System.arraycopy(upperBounds, 0, out, 0, len);
478: return out;
479: }
480:
481: /**
482: * Property indicating whether or not the distribution has been loaded.
483: *
484: * @return true if the distribution has been loaded
485: */
486: public boolean isLoaded() {
487: return loaded;
488: }
489: }
|