View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math.random;
19  
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.FileReader;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Serializable;
26  import java.net.URL;
27  import java.util.ArrayList;
28  import java.util.List;
29  
30  import org.apache.commons.math.exception.MathIllegalArgumentException;
31  import org.apache.commons.math.exception.MathIllegalStateException;
32  import org.apache.commons.math.exception.NullArgumentException;
33  import org.apache.commons.math.exception.ZeroException;
34  import org.apache.commons.math.exception.util.LocalizedFormats;
35  import org.apache.commons.math.stat.descriptive.StatisticalSummary;
36  import org.apache.commons.math.stat.descriptive.SummaryStatistics;
37  import org.apache.commons.math.util.FastMath;
38  import org.apache.commons.math.util.MathUtils;
39  
40  /**
41   * Implements <code>EmpiricalDistribution</code> interface.  This implementation
42   * uses what amounts to the
43   * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
44   * Variable Kernel Method</a> with Gaussian smoothing:<p>
45   * <strong>Digesting the input file</strong>
46   * <ol><li>Pass the file once to compute min and max.</li>
47   * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
48   * <li>Pass the data file again, computing bin counts and univariate
49   *     statistics (mean, std dev.) for each of the bins </li>
50   * <li>Divide the interval (0,1) into subintervals associated with the bins,
51   *     with the length of a bin's subinterval proportional to its count.</li></ol>
52   * <strong>Generating random values from the distribution</strong><ol>
53   * <li>Generate a uniformly distributed value in (0,1) </li>
54   * <li>Select the subinterval to which the value belongs.
55   * <li>Generate a random Gaussian value with mean = mean of the associated
56   *     bin and std dev = std dev of associated bin.</li></ol></p><p>
57   *<strong>USAGE NOTES:</strong><ul>
58   *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
59   *    is to set the bin count to approximately the length of the input file divided
60   *    by 10. </li>
61   *<li>The input file <i>must</i> be a plain text file containing one valid numeric
62   *    entry per line.</li>
63   * </ul></p>
64   *
65   * @version $Id: EmpiricalDistributionImpl.java 1179951 2011-10-07 07:33:21Z luc $
66   */
67  public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
68  
69      /** Default bin count */
70      public static final int DEFAULT_BIN_COUNT = 1000;
71  
72      /** Serializable version identifier */
73      private static final long serialVersionUID = 5729073523949762654L;
74  
75      /** List of SummaryStatistics objects characterizing the bins */
76      private final List<SummaryStatistics> binStats;
77  
78      /** Sample statistics */
79      private SummaryStatistics sampleStats = null;
80  
81      /** Max loaded value */
82      private double max = Double.NEGATIVE_INFINITY;
83  
84      /** Min loaded value */
85      private double min = Double.POSITIVE_INFINITY;
86  
87      /** Grid size */
88      private double delta = 0d;
89  
90      /** number of bins */
91      private final int binCount;
92  
93      /** is the distribution loaded? */
94      private boolean loaded = false;
95  
96      /** upper bounds of subintervals in (0,1) "belonging" to the bins */
97      private double[] upperBounds = null;
98  
99      /** RandomDataImpl instance to use in repeated calls to getNext() */
100     private final RandomDataImpl randomData;
101 
102     /**
103      * Creates a new EmpiricalDistribution with the default bin count.
104      */
105     public EmpiricalDistributionImpl() {
106         this(DEFAULT_BIN_COUNT, new RandomDataImpl());
107     }
108 
109     /**
110      * Creates a new EmpiricalDistribution with the specified bin count.
111      *
112      * @param binCount number of bins
113      */
114     public EmpiricalDistributionImpl(int binCount) {
115         this(binCount, new RandomDataImpl());
116     }
117 
118     /**
119      * Creates a new EmpiricalDistribution with the specified bin count using the
120      * provided {@link RandomGenerator} as the source of random data.
121      *
122      * @param binCount number of bins
123      * @param generator random data generator (may be null, resulting in default JDK generator)
124      * @since 3.0
125      */
126     public EmpiricalDistributionImpl(int binCount, RandomGenerator generator) {
127         this.binCount = binCount;
128         randomData = new RandomDataImpl(generator);
129         binStats = new ArrayList<SummaryStatistics>();
130     }
131 
132     /**
133      * Creates a new EmpiricalDistribution with default bin count using the
134      * provided {@link RandomGenerator} as the source of random data.
135      *
136      * @param generator random data generator (may be null, resulting in default JDK generator)
137      * @since 3.0
138      */
139     public EmpiricalDistributionImpl(RandomGenerator generator) {
140         this(DEFAULT_BIN_COUNT, generator);
141     }
142 
143     /**
144      * Creates a new EmpiricalDistribution with the specified bin count using the
145      * provided {@link RandomDataImpl} instance as the source of random data.
146      *
147      * @param binCount number of bins
148      * @param randomData random data generator (may be null, resulting in default JDK generator)
149      * @since 3.0
150      */
151     public EmpiricalDistributionImpl(int binCount, RandomDataImpl randomData) {
152         this.binCount = binCount;
153         this.randomData = randomData;
154         binStats = new ArrayList<SummaryStatistics>();
155     }
156 
157     /**
158      * Creates a new EmpiricalDistribution with default bin count using the
159      * provided {@link RandomDataImpl} as the source of random data.
160      *
161      * @param randomData random data generator (may be null, resulting in default JDK generator)
162      * @since 3.0
163      */
164     public EmpiricalDistributionImpl(RandomDataImpl randomData) {
165         this(DEFAULT_BIN_COUNT, randomData);
166     }
167 
168      /**
169      * Computes the empirical distribution from the provided
170      * array of numbers.
171      *
172      * @param in the input data array
173      * @exception NullArgumentException if in is null
174      */
175     public void load(double[] in) throws NullArgumentException {
176         DataAdapter da = new ArrayDataAdapter(in);
177         try {
178             da.computeStats();
179             fillBinStats(in);
180         } catch (IOException e) {
181             throw new MathIllegalStateException(e, LocalizedFormats.SIMPLE_MESSAGE, e.getLocalizedMessage());
182         }
183         loaded = true;
184 
185     }
186 
187     /**
188      * Computes the empirical distribution using data read from a URL.
189      * @param url  url of the input file
190      *
191      * @throws IOException if an IO error occurs
192      * @throws NullArgumentException if url is null
193      */
194     public void load(URL url) throws IOException, NullArgumentException {
195         MathUtils.checkNotNull(url);
196         BufferedReader in =
197             new BufferedReader(new InputStreamReader(url.openStream()));
198         try {
199             DataAdapter da = new StreamDataAdapter(in);
200             da.computeStats();
201             if (sampleStats.getN() == 0) {
202                 throw new ZeroException(LocalizedFormats.URL_CONTAINS_NO_DATA, url);
203             }
204             in = new BufferedReader(new InputStreamReader(url.openStream()));
205             fillBinStats(in);
206             loaded = true;
207         } finally {
208            try {
209                in.close();
210            } catch (IOException ex) {
211                // ignore
212            }
213         }
214     }
215 
216     /**
217      * Computes the empirical distribution from the input file.
218      *
219      * @param file the input file
220      * @throws IOException if an IO error occurs
221      * @throws NullArgumentException if file is null
222      */
223     public void load(File file) throws IOException, NullArgumentException {
224         MathUtils.checkNotNull(file);
225         BufferedReader in = new BufferedReader(new FileReader(file));
226         try {
227             DataAdapter da = new StreamDataAdapter(in);
228             da.computeStats();
229             in = new BufferedReader(new FileReader(file));
230             fillBinStats(in);
231             loaded = true;
232         } finally {
233             try {
234                 in.close();
235             } catch (IOException ex) {
236                 // ignore
237             }
238         }
239     }
240 
241     /**
242      * Provides methods for computing <code>sampleStats</code> and
243      * <code>beanStats</code> abstracting the source of data.
244      */
245     private abstract class DataAdapter{
246 
247         /**
248          * Compute bin stats.
249          *
250          * @throws IOException  if an error occurs computing bin stats
251          */
252         public abstract void computeBinStats() throws IOException;
253 
254         /**
255          * Compute sample statistics.
256          *
257          * @throws IOException if an error occurs computing sample stats
258          */
259         public abstract void computeStats() throws IOException;
260 
261     }
262 
263     /**
264      * Factory of <code>DataAdapter</code> objects. For every supported source
265      * of data (array of doubles, file, etc.) an instance of the proper object
266      * is returned.
267      */
268     private class DataAdapterFactory{
269         /**
270          * Creates a DataAdapter from a data object
271          *
272          * @param in object providing access to the data
273          * @return DataAdapter instance
274          */
275         public DataAdapter getAdapter(Object in) {
276             if (in instanceof BufferedReader) {
277                 BufferedReader inputStream = (BufferedReader) in;
278                 return new StreamDataAdapter(inputStream);
279             } else if (in instanceof double[]) {
280                 double[] inputArray = (double[]) in;
281                 return new ArrayDataAdapter(inputArray);
282             } else {
283                 throw new MathIllegalArgumentException(
284                       LocalizedFormats.INPUT_DATA_FROM_UNSUPPORTED_DATASOURCE,
285                       in.getClass().getName(),
286                       BufferedReader.class.getName(), double[].class.getName());
287             }
288         }
289     }
290     /**
291      * <code>DataAdapter</code> for data provided through some input stream
292      */
293     private class StreamDataAdapter extends DataAdapter{
294 
295         /** Input stream providing access to the data */
296         private BufferedReader inputStream;
297 
298         /**
299          * Create a StreamDataAdapter from a BufferedReader
300          *
301          * @param in BufferedReader input stream
302          */
303         public StreamDataAdapter(BufferedReader in){
304             super();
305             inputStream = in;
306         }
307 
308         /** {@inheritDoc} */
309         @Override
310         public void computeBinStats() throws IOException {
311             String str = null;
312             double val = 0.0d;
313             while ((str = inputStream.readLine()) != null) {
314                 val = Double.parseDouble(str);
315                 SummaryStatistics stats = binStats.get(findBin(val));
316                 stats.addValue(val);
317             }
318 
319             inputStream.close();
320             inputStream = null;
321         }
322 
323         /** {@inheritDoc} */
324         @Override
325         public void computeStats() throws IOException {
326             String str = null;
327             double val = 0.0;
328             sampleStats = new SummaryStatistics();
329             while ((str = inputStream.readLine()) != null) {
330                 val = Double.valueOf(str).doubleValue();
331                 sampleStats.addValue(val);
332             }
333             inputStream.close();
334             inputStream = null;
335         }
336     }
337 
338     /**
339      * <code>DataAdapter</code> for data provided as array of doubles.
340      */
341     private class ArrayDataAdapter extends DataAdapter {
342 
343         /** Array of input  data values */
344         private double[] inputArray;
345 
346         /**
347          * Construct an ArrayDataAdapter from a double[] array
348          *
349          * @param in double[] array holding the data
350          * @throws NullArgumentException if in is null
351          */
352         public ArrayDataAdapter(double[] in) throws NullArgumentException {
353             super();
354             MathUtils.checkNotNull(in);
355             inputArray = in;
356         }
357 
358         /** {@inheritDoc} */
359         @Override
360         public void computeStats() throws IOException {
361             sampleStats = new SummaryStatistics();
362             for (int i = 0; i < inputArray.length; i++) {
363                 sampleStats.addValue(inputArray[i]);
364             }
365         }
366 
367         /** {@inheritDoc} */
368         @Override
369         public void computeBinStats() throws IOException {
370             for (int i = 0; i < inputArray.length; i++) {
371                 SummaryStatistics stats =
372                     binStats.get(findBin(inputArray[i]));
373                 stats.addValue(inputArray[i]);
374             }
375         }
376     }
377 
378     /**
379      * Fills binStats array (second pass through data file).
380      *
381      * @param in object providing access to the data
382      * @throws IOException  if an IO error occurs
383      */
384     private void fillBinStats(Object in) throws IOException {
385         // Set up grid
386         min = sampleStats.getMin();
387         max = sampleStats.getMax();
388         delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
389 
390         // Initialize binStats ArrayList
391         if (!binStats.isEmpty()) {
392             binStats.clear();
393         }
394         for (int i = 0; i < binCount; i++) {
395             SummaryStatistics stats = new SummaryStatistics();
396             binStats.add(i,stats);
397         }
398 
399         // Filling data in binStats Array
400         DataAdapterFactory aFactory = new DataAdapterFactory();
401         DataAdapter da = aFactory.getAdapter(in);
402         da.computeBinStats();
403 
404         // Assign upperBounds based on bin counts
405         upperBounds = new double[binCount];
406         upperBounds[0] =
407         ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
408         for (int i = 1; i < binCount-1; i++) {
409             upperBounds[i] = upperBounds[i-1] +
410             ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
411         }
412         upperBounds[binCount-1] = 1.0d;
413     }
414 
415     /**
416      * Returns the index of the bin to which the given value belongs
417      *
418      * @param value  the value whose bin we are trying to find
419      * @return the index of the bin containing the value
420      */
421     private int findBin(double value) {
422         return FastMath.min(
423                 FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0),
424                 binCount - 1);
425         }
426 
427     /**
428      * Generates a random value from this distribution.
429      *
430      * @return the random value.
431      * @throws MathIllegalStateException if the distribution has not been loaded
432      */
433     public double getNextValue() throws MathIllegalStateException {
434 
435         if (!loaded) {
436             throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
437         }
438 
439         // Start with a uniformly distributed random number in (0,1)
440         double x = randomData.nextUniform(0,1);
441 
442         // Use this to select the bin and generate a Gaussian within the bin
443         for (int i = 0; i < binCount; i++) {
444            if (x <= upperBounds[i]) {
445                SummaryStatistics stats = binStats.get(i);
446                if (stats.getN() > 0) {
447                    if (stats.getStandardDeviation() > 0) {  // more than one obs
448                         return randomData.nextGaussian
449                             (stats.getMean(),stats.getStandardDeviation());
450                    } else {
451                        return stats.getMean(); // only one obs in bin
452                    }
453                }
454            }
455         }
456         throw new MathIllegalStateException(LocalizedFormats.NO_BIN_SELECTED);
457     }
458 
459     /**
460      * Returns a {@link StatisticalSummary} describing this distribution.
461      * <strong>Preconditions:</strong><ul>
462      * <li>the distribution must be loaded before invoking this method</li></ul>
463      *
464      * @return the sample statistics
465      * @throws IllegalStateException if the distribution has not been loaded
466      */
467     public StatisticalSummary getSampleStats() {
468         return sampleStats;
469     }
470 
471     /**
472      * Returns the number of bins.
473      *
474      * @return the number of bins.
475      */
476     public int getBinCount() {
477         return binCount;
478     }
479 
480     /**
481      * Returns a List of {@link SummaryStatistics} instances containing
482      * statistics describing the values in each of the bins.  The list is
483      * indexed on the bin number.
484      *
485      * @return List of bin statistics.
486      */
487     public List<SummaryStatistics> getBinStats() {
488         return binStats;
489     }
490 
491     /**
492      * <p>Returns a fresh copy of the array of upper bounds for the bins.
493      * Bins are: <br/>
494      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
495      *  (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
496      *
497      * <p>Note: In versions 1.0-2.0 of commons-math, this method
498      * incorrectly returned the array of probability generator upper
499      * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
500      *
501      * @return array of bin upper bounds
502      * @since 2.1
503      */
504     public double[] getUpperBounds() {
505         double[] binUpperBounds = new double[binCount];
506         binUpperBounds[0] = min + delta;
507         for (int i = 1; i < binCount - 1; i++) {
508             binUpperBounds[i] = binUpperBounds[i-1] + delta;
509         }
510         binUpperBounds[binCount - 1] = max;
511         return binUpperBounds;
512     }
513 
514     /**
515      * <p>Returns a fresh copy of the array of upper bounds of the subintervals
516      * of [0,1] used in generating data from the empirical distribution.
517      * Subintervals correspond to bins with lengths proportional to bin counts.</p>
518      *
519      * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
520      * by {@link #getUpperBounds()}.</p>
521      *
522      * @since 2.1
523      * @return array of upper bounds of subintervals used in data generation
524      */
525     public double[] getGeneratorUpperBounds() {
526         int len = upperBounds.length;
527         double[] out = new double[len];
528         System.arraycopy(upperBounds, 0, out, 0, len);
529         return out;
530     }
531 
532     /**
533      * Property indicating whether or not the distribution has been loaded.
534      *
535      * @return true if the distribution has been loaded
536      */
537     public boolean isLoaded() {
538         return loaded;
539     }
540 
541     /**
542      * Reseeds the random number generator used by {@link #getNextValue()}.
543      *
544      * @param seed random generator seed
545      * @since 3.0
546      */
547     public void reSeed(long seed) {
548         randomData.reSeed(seed);
549     }
550 }