1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math.random;
19
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.FileReader;
23 import java.io.IOException;
24 import java.io.InputStreamReader;
25 import java.io.Serializable;
26 import java.net.URL;
27 import java.util.ArrayList;
28 import java.util.List;
29
30 import org.apache.commons.math.exception.MathIllegalArgumentException;
31 import org.apache.commons.math.exception.MathIllegalStateException;
32 import org.apache.commons.math.exception.NullArgumentException;
33 import org.apache.commons.math.exception.ZeroException;
34 import org.apache.commons.math.exception.util.LocalizedFormats;
35 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
36 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
37 import org.apache.commons.math.util.FastMath;
38 import org.apache.commons.math.util.MathUtils;
39
40 /**
41 * Implements <code>EmpiricalDistribution</code> interface. This implementation
42 * uses what amounts to the
43 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
44 * Variable Kernel Method</a> with Gaussian smoothing:<p>
45 * <strong>Digesting the input file</strong>
46 * <ol><li>Pass the file once to compute min and max.</li>
47 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
48 * <li>Pass the data file again, computing bin counts and univariate
49 * statistics (mean, std dev.) for each of the bins </li>
50 * <li>Divide the interval (0,1) into subintervals associated with the bins,
51 * with the length of a bin's subinterval proportional to its count.</li></ol>
52 * <strong>Generating random values from the distribution</strong><ol>
53 * <li>Generate a uniformly distributed value in (0,1) </li>
54 * <li>Select the subinterval to which the value belongs.
55 * <li>Generate a random Gaussian value with mean = mean of the associated
56 * bin and std dev = std dev of associated bin.</li></ol></p><p>
57 *<strong>USAGE NOTES:</strong><ul>
58 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
59 * is to set the bin count to approximately the length of the input file divided
60 * by 10. </li>
61 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
62 * entry per line.</li>
63 * </ul></p>
64 *
65 * @version $Id: EmpiricalDistributionImpl.java 1179951 2011-10-07 07:33:21Z luc $
66 */
67 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
68
69 /** Default bin count */
70 public static final int DEFAULT_BIN_COUNT = 1000;
71
72 /** Serializable version identifier */
73 private static final long serialVersionUID = 5729073523949762654L;
74
75 /** List of SummaryStatistics objects characterizing the bins */
76 private final List<SummaryStatistics> binStats;
77
78 /** Sample statistics */
79 private SummaryStatistics sampleStats = null;
80
81 /** Max loaded value */
82 private double max = Double.NEGATIVE_INFINITY;
83
84 /** Min loaded value */
85 private double min = Double.POSITIVE_INFINITY;
86
87 /** Grid size */
88 private double delta = 0d;
89
90 /** number of bins */
91 private final int binCount;
92
93 /** is the distribution loaded? */
94 private boolean loaded = false;
95
96 /** upper bounds of subintervals in (0,1) "belonging" to the bins */
97 private double[] upperBounds = null;
98
99 /** RandomDataImpl instance to use in repeated calls to getNext() */
100 private final RandomDataImpl randomData;
101
102 /**
103 * Creates a new EmpiricalDistribution with the default bin count.
104 */
105 public EmpiricalDistributionImpl() {
106 this(DEFAULT_BIN_COUNT, new RandomDataImpl());
107 }
108
109 /**
110 * Creates a new EmpiricalDistribution with the specified bin count.
111 *
112 * @param binCount number of bins
113 */
114 public EmpiricalDistributionImpl(int binCount) {
115 this(binCount, new RandomDataImpl());
116 }
117
118 /**
119 * Creates a new EmpiricalDistribution with the specified bin count using the
120 * provided {@link RandomGenerator} as the source of random data.
121 *
122 * @param binCount number of bins
123 * @param generator random data generator (may be null, resulting in default JDK generator)
124 * @since 3.0
125 */
126 public EmpiricalDistributionImpl(int binCount, RandomGenerator generator) {
127 this.binCount = binCount;
128 randomData = new RandomDataImpl(generator);
129 binStats = new ArrayList<SummaryStatistics>();
130 }
131
132 /**
133 * Creates a new EmpiricalDistribution with default bin count using the
134 * provided {@link RandomGenerator} as the source of random data.
135 *
136 * @param generator random data generator (may be null, resulting in default JDK generator)
137 * @since 3.0
138 */
139 public EmpiricalDistributionImpl(RandomGenerator generator) {
140 this(DEFAULT_BIN_COUNT, generator);
141 }
142
143 /**
144 * Creates a new EmpiricalDistribution with the specified bin count using the
145 * provided {@link RandomDataImpl} instance as the source of random data.
146 *
147 * @param binCount number of bins
148 * @param randomData random data generator (may be null, resulting in default JDK generator)
149 * @since 3.0
150 */
151 public EmpiricalDistributionImpl(int binCount, RandomDataImpl randomData) {
152 this.binCount = binCount;
153 this.randomData = randomData;
154 binStats = new ArrayList<SummaryStatistics>();
155 }
156
157 /**
158 * Creates a new EmpiricalDistribution with default bin count using the
159 * provided {@link RandomDataImpl} as the source of random data.
160 *
161 * @param randomData random data generator (may be null, resulting in default JDK generator)
162 * @since 3.0
163 */
164 public EmpiricalDistributionImpl(RandomDataImpl randomData) {
165 this(DEFAULT_BIN_COUNT, randomData);
166 }
167
168 /**
169 * Computes the empirical distribution from the provided
170 * array of numbers.
171 *
172 * @param in the input data array
173 * @exception NullArgumentException if in is null
174 */
175 public void load(double[] in) throws NullArgumentException {
176 DataAdapter da = new ArrayDataAdapter(in);
177 try {
178 da.computeStats();
179 fillBinStats(in);
180 } catch (IOException e) {
181 throw new MathIllegalStateException(e, LocalizedFormats.SIMPLE_MESSAGE, e.getLocalizedMessage());
182 }
183 loaded = true;
184
185 }
186
187 /**
188 * Computes the empirical distribution using data read from a URL.
189 * @param url url of the input file
190 *
191 * @throws IOException if an IO error occurs
192 * @throws NullArgumentException if url is null
193 */
194 public void load(URL url) throws IOException, NullArgumentException {
195 MathUtils.checkNotNull(url);
196 BufferedReader in =
197 new BufferedReader(new InputStreamReader(url.openStream()));
198 try {
199 DataAdapter da = new StreamDataAdapter(in);
200 da.computeStats();
201 if (sampleStats.getN() == 0) {
202 throw new ZeroException(LocalizedFormats.URL_CONTAINS_NO_DATA, url);
203 }
204 in = new BufferedReader(new InputStreamReader(url.openStream()));
205 fillBinStats(in);
206 loaded = true;
207 } finally {
208 try {
209 in.close();
210 } catch (IOException ex) {
211 // ignore
212 }
213 }
214 }
215
216 /**
217 * Computes the empirical distribution from the input file.
218 *
219 * @param file the input file
220 * @throws IOException if an IO error occurs
221 * @throws NullArgumentException if file is null
222 */
223 public void load(File file) throws IOException, NullArgumentException {
224 MathUtils.checkNotNull(file);
225 BufferedReader in = new BufferedReader(new FileReader(file));
226 try {
227 DataAdapter da = new StreamDataAdapter(in);
228 da.computeStats();
229 in = new BufferedReader(new FileReader(file));
230 fillBinStats(in);
231 loaded = true;
232 } finally {
233 try {
234 in.close();
235 } catch (IOException ex) {
236 // ignore
237 }
238 }
239 }
240
241 /**
242 * Provides methods for computing <code>sampleStats</code> and
243 * <code>beanStats</code> abstracting the source of data.
244 */
245 private abstract class DataAdapter{
246
247 /**
248 * Compute bin stats.
249 *
250 * @throws IOException if an error occurs computing bin stats
251 */
252 public abstract void computeBinStats() throws IOException;
253
254 /**
255 * Compute sample statistics.
256 *
257 * @throws IOException if an error occurs computing sample stats
258 */
259 public abstract void computeStats() throws IOException;
260
261 }
262
263 /**
264 * Factory of <code>DataAdapter</code> objects. For every supported source
265 * of data (array of doubles, file, etc.) an instance of the proper object
266 * is returned.
267 */
268 private class DataAdapterFactory{
269 /**
270 * Creates a DataAdapter from a data object
271 *
272 * @param in object providing access to the data
273 * @return DataAdapter instance
274 */
275 public DataAdapter getAdapter(Object in) {
276 if (in instanceof BufferedReader) {
277 BufferedReader inputStream = (BufferedReader) in;
278 return new StreamDataAdapter(inputStream);
279 } else if (in instanceof double[]) {
280 double[] inputArray = (double[]) in;
281 return new ArrayDataAdapter(inputArray);
282 } else {
283 throw new MathIllegalArgumentException(
284 LocalizedFormats.INPUT_DATA_FROM_UNSUPPORTED_DATASOURCE,
285 in.getClass().getName(),
286 BufferedReader.class.getName(), double[].class.getName());
287 }
288 }
289 }
290 /**
291 * <code>DataAdapter</code> for data provided through some input stream
292 */
293 private class StreamDataAdapter extends DataAdapter{
294
295 /** Input stream providing access to the data */
296 private BufferedReader inputStream;
297
298 /**
299 * Create a StreamDataAdapter from a BufferedReader
300 *
301 * @param in BufferedReader input stream
302 */
303 public StreamDataAdapter(BufferedReader in){
304 super();
305 inputStream = in;
306 }
307
308 /** {@inheritDoc} */
309 @Override
310 public void computeBinStats() throws IOException {
311 String str = null;
312 double val = 0.0d;
313 while ((str = inputStream.readLine()) != null) {
314 val = Double.parseDouble(str);
315 SummaryStatistics stats = binStats.get(findBin(val));
316 stats.addValue(val);
317 }
318
319 inputStream.close();
320 inputStream = null;
321 }
322
323 /** {@inheritDoc} */
324 @Override
325 public void computeStats() throws IOException {
326 String str = null;
327 double val = 0.0;
328 sampleStats = new SummaryStatistics();
329 while ((str = inputStream.readLine()) != null) {
330 val = Double.valueOf(str).doubleValue();
331 sampleStats.addValue(val);
332 }
333 inputStream.close();
334 inputStream = null;
335 }
336 }
337
338 /**
339 * <code>DataAdapter</code> for data provided as array of doubles.
340 */
341 private class ArrayDataAdapter extends DataAdapter {
342
343 /** Array of input data values */
344 private double[] inputArray;
345
346 /**
347 * Construct an ArrayDataAdapter from a double[] array
348 *
349 * @param in double[] array holding the data
350 * @throws NullArgumentException if in is null
351 */
352 public ArrayDataAdapter(double[] in) throws NullArgumentException {
353 super();
354 MathUtils.checkNotNull(in);
355 inputArray = in;
356 }
357
358 /** {@inheritDoc} */
359 @Override
360 public void computeStats() throws IOException {
361 sampleStats = new SummaryStatistics();
362 for (int i = 0; i < inputArray.length; i++) {
363 sampleStats.addValue(inputArray[i]);
364 }
365 }
366
367 /** {@inheritDoc} */
368 @Override
369 public void computeBinStats() throws IOException {
370 for (int i = 0; i < inputArray.length; i++) {
371 SummaryStatistics stats =
372 binStats.get(findBin(inputArray[i]));
373 stats.addValue(inputArray[i]);
374 }
375 }
376 }
377
378 /**
379 * Fills binStats array (second pass through data file).
380 *
381 * @param in object providing access to the data
382 * @throws IOException if an IO error occurs
383 */
384 private void fillBinStats(Object in) throws IOException {
385 // Set up grid
386 min = sampleStats.getMin();
387 max = sampleStats.getMax();
388 delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
389
390 // Initialize binStats ArrayList
391 if (!binStats.isEmpty()) {
392 binStats.clear();
393 }
394 for (int i = 0; i < binCount; i++) {
395 SummaryStatistics stats = new SummaryStatistics();
396 binStats.add(i,stats);
397 }
398
399 // Filling data in binStats Array
400 DataAdapterFactory aFactory = new DataAdapterFactory();
401 DataAdapter da = aFactory.getAdapter(in);
402 da.computeBinStats();
403
404 // Assign upperBounds based on bin counts
405 upperBounds = new double[binCount];
406 upperBounds[0] =
407 ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
408 for (int i = 1; i < binCount-1; i++) {
409 upperBounds[i] = upperBounds[i-1] +
410 ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
411 }
412 upperBounds[binCount-1] = 1.0d;
413 }
414
415 /**
416 * Returns the index of the bin to which the given value belongs
417 *
418 * @param value the value whose bin we are trying to find
419 * @return the index of the bin containing the value
420 */
421 private int findBin(double value) {
422 return FastMath.min(
423 FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0),
424 binCount - 1);
425 }
426
427 /**
428 * Generates a random value from this distribution.
429 *
430 * @return the random value.
431 * @throws MathIllegalStateException if the distribution has not been loaded
432 */
433 public double getNextValue() throws MathIllegalStateException {
434
435 if (!loaded) {
436 throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
437 }
438
439 // Start with a uniformly distributed random number in (0,1)
440 double x = randomData.nextUniform(0,1);
441
442 // Use this to select the bin and generate a Gaussian within the bin
443 for (int i = 0; i < binCount; i++) {
444 if (x <= upperBounds[i]) {
445 SummaryStatistics stats = binStats.get(i);
446 if (stats.getN() > 0) {
447 if (stats.getStandardDeviation() > 0) { // more than one obs
448 return randomData.nextGaussian
449 (stats.getMean(),stats.getStandardDeviation());
450 } else {
451 return stats.getMean(); // only one obs in bin
452 }
453 }
454 }
455 }
456 throw new MathIllegalStateException(LocalizedFormats.NO_BIN_SELECTED);
457 }
458
459 /**
460 * Returns a {@link StatisticalSummary} describing this distribution.
461 * <strong>Preconditions:</strong><ul>
462 * <li>the distribution must be loaded before invoking this method</li></ul>
463 *
464 * @return the sample statistics
465 * @throws IllegalStateException if the distribution has not been loaded
466 */
467 public StatisticalSummary getSampleStats() {
468 return sampleStats;
469 }
470
471 /**
472 * Returns the number of bins.
473 *
474 * @return the number of bins.
475 */
476 public int getBinCount() {
477 return binCount;
478 }
479
480 /**
481 * Returns a List of {@link SummaryStatistics} instances containing
482 * statistics describing the values in each of the bins. The list is
483 * indexed on the bin number.
484 *
485 * @return List of bin statistics.
486 */
487 public List<SummaryStatistics> getBinStats() {
488 return binStats;
489 }
490
491 /**
492 * <p>Returns a fresh copy of the array of upper bounds for the bins.
493 * Bins are: <br/>
494 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
495 * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
496 *
497 * <p>Note: In versions 1.0-2.0 of commons-math, this method
498 * incorrectly returned the array of probability generator upper
499 * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
500 *
501 * @return array of bin upper bounds
502 * @since 2.1
503 */
504 public double[] getUpperBounds() {
505 double[] binUpperBounds = new double[binCount];
506 binUpperBounds[0] = min + delta;
507 for (int i = 1; i < binCount - 1; i++) {
508 binUpperBounds[i] = binUpperBounds[i-1] + delta;
509 }
510 binUpperBounds[binCount - 1] = max;
511 return binUpperBounds;
512 }
513
514 /**
515 * <p>Returns a fresh copy of the array of upper bounds of the subintervals
516 * of [0,1] used in generating data from the empirical distribution.
517 * Subintervals correspond to bins with lengths proportional to bin counts.</p>
518 *
519 * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
520 * by {@link #getUpperBounds()}.</p>
521 *
522 * @since 2.1
523 * @return array of upper bounds of subintervals used in data generation
524 */
525 public double[] getGeneratorUpperBounds() {
526 int len = upperBounds.length;
527 double[] out = new double[len];
528 System.arraycopy(upperBounds, 0, out, 0, len);
529 return out;
530 }
531
532 /**
533 * Property indicating whether or not the distribution has been loaded.
534 *
535 * @return true if the distribution has been loaded
536 */
537 public boolean isLoaded() {
538 return loaded;
539 }
540
541 /**
542 * Reseeds the random number generator used by {@link #getNextValue()}.
543 *
544 * @param seed random generator seed
545 * @since 3.0
546 */
547 public void reSeed(long seed) {
548 randomData.reSeed(seed);
549 }
550 }