Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021import org.apache.commons.math3.exception.util.LocalizedFormats;
022import org.apache.commons.math3.random.RandomGenerator;
023import org.apache.commons.math3.random.Well19937c;
024import org.apache.commons.math3.util.FastMath;
025
026/**
027 * Implementation of the Zipf distribution.
028 * <p>
029 * <strong>Parameters:</strong>
030 * For a random variable {@code X} whose values are distributed according to this
031 * distribution, the probability mass function is given by
032 * <pre>
033 *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
034 * </pre>
035 * {@code H(N,s)} is the normalizing constant
036 * which corresponds to the generalized harmonic number of order N of s.
037 * <p>
038 * <ul>
039 * <li>{@code N} is the number of elements</li>
040 * <li>{@code s} is the exponent</li>
041 * </ul>
042 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
043 * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
044 */
045public class ZipfDistribution extends AbstractIntegerDistribution {
046    /** Serializable version identifier. */
047    private static final long serialVersionUID = -140627372283420404L;
048    /** Number of elements. */
049    private final int numberOfElements;
050    /** Exponent parameter of the distribution. */
051    private final double exponent;
052    /** Cached numerical mean */
053    private double numericalMean = Double.NaN;
054    /** Whether or not the numerical mean has been calculated */
055    private boolean numericalMeanIsCalculated = false;
056    /** Cached numerical variance */
057    private double numericalVariance = Double.NaN;
058    /** Whether or not the numerical variance has been calculated */
059    private boolean numericalVarianceIsCalculated = false;
060    /** The sampler to be used for the sample() method */
061    private transient ZipfRejectionInversionSampler sampler;
062
063    /**
064     * Create a new Zipf distribution with the given number of elements and
065     * exponent.
066     * <p>
067     * <b>Note:</b> this constructor will implicitly create an instance of
068     * {@link Well19937c} as random generator to be used for sampling only (see
069     * {@link #sample()} and {@link #sample(int)}). In case no sampling is
070     * needed for the created distribution, it is advised to pass {@code null}
071     * as random generator via the appropriate constructors to avoid the
072     * additional initialisation overhead.
073     *
074     * @param numberOfElements Number of elements.
075     * @param exponent Exponent.
076     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
077     * or {@code exponent <= 0}.
078     */
079    public ZipfDistribution(final int numberOfElements, final double exponent) {
080        this(new Well19937c(), numberOfElements, exponent);
081    }
082
083    /**
084     * Creates a Zipf distribution.
085     *
086     * @param rng Random number generator.
087     * @param numberOfElements Number of elements.
088     * @param exponent Exponent.
089     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
090     * or {@code exponent <= 0}.
091     * @since 3.1
092     */
093    public ZipfDistribution(RandomGenerator rng,
094                            int numberOfElements,
095                            double exponent)
096        throws NotStrictlyPositiveException {
097        super(rng);
098
099        if (numberOfElements <= 0) {
100            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
101                                                   numberOfElements);
102        }
103        if (exponent <= 0) {
104            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
105                                                   exponent);
106        }
107
108        this.numberOfElements = numberOfElements;
109        this.exponent = exponent;
110    }
111
112    /**
113     * Get the number of elements (e.g. corpus size) for the distribution.
114     *
115     * @return the number of elements
116     */
117    public int getNumberOfElements() {
118        return numberOfElements;
119    }
120
121    /**
122     * Get the exponent characterizing the distribution.
123     *
124     * @return the exponent
125     */
126    public double getExponent() {
127        return exponent;
128    }
129
130    /** {@inheritDoc} */
131    public double probability(final int x) {
132        if (x <= 0 || x > numberOfElements) {
133            return 0.0;
134        }
135
136        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
137    }
138
139    /** {@inheritDoc} */
140    @Override
141    public double logProbability(int x) {
142        if (x <= 0 || x > numberOfElements) {
143            return Double.NEGATIVE_INFINITY;
144        }
145
146        return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
147    }
148
149    /** {@inheritDoc} */
150    public double cumulativeProbability(final int x) {
151        if (x <= 0) {
152            return 0.0;
153        } else if (x >= numberOfElements) {
154            return 1.0;
155        }
156
157        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
158    }
159
160    /**
161     * {@inheritDoc}
162     *
163     * For number of elements {@code N} and exponent {@code s}, the mean is
164     * {@code Hs1 / Hs}, where
165     * <ul>
166     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
167     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
168     * </ul>
169     */
170    public double getNumericalMean() {
171        if (!numericalMeanIsCalculated) {
172            numericalMean = calculateNumericalMean();
173            numericalMeanIsCalculated = true;
174        }
175        return numericalMean;
176    }
177
178    /**
179     * Used by {@link #getNumericalMean()}.
180     *
181     * @return the mean of this distribution
182     */
183    protected double calculateNumericalMean() {
184        final int N = getNumberOfElements();
185        final double s = getExponent();
186
187        final double Hs1 = generalizedHarmonic(N, s - 1);
188        final double Hs = generalizedHarmonic(N, s);
189
190        return Hs1 / Hs;
191    }
192
193    /**
194     * {@inheritDoc}
195     *
196     * For number of elements {@code N} and exponent {@code s}, the mean is
197     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
198     * <ul>
199     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
200     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
201     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
202     * </ul>
203     */
204    public double getNumericalVariance() {
205        if (!numericalVarianceIsCalculated) {
206            numericalVariance = calculateNumericalVariance();
207            numericalVarianceIsCalculated = true;
208        }
209        return numericalVariance;
210    }
211
212    /**
213     * Used by {@link #getNumericalVariance()}.
214     *
215     * @return the variance of this distribution
216     */
217    protected double calculateNumericalVariance() {
218        final int N = getNumberOfElements();
219        final double s = getExponent();
220
221        final double Hs2 = generalizedHarmonic(N, s - 2);
222        final double Hs1 = generalizedHarmonic(N, s - 1);
223        final double Hs = generalizedHarmonic(N, s);
224
225        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
226    }
227
228    /**
229     * Calculates the Nth generalized harmonic number. See
230     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
231     * Series</a>.
232     *
233     * @param n Term in the series to calculate (must be larger than 1)
234     * @param m Exponent (special case {@code m = 1} is the harmonic series).
235     * @return the n<sup>th</sup> generalized harmonic number.
236     */
237    private double generalizedHarmonic(final int n, final double m) {
238        double value = 0;
239        for (int k = n; k > 0; --k) {
240            value += 1.0 / FastMath.pow(k, m);
241        }
242        return value;
243    }
244
245    /**
246     * {@inheritDoc}
247     *
248     * The lower bound of the support is always 1 no matter the parameters.
249     *
250     * @return lower bound of the support (always 1)
251     */
252    public int getSupportLowerBound() {
253        return 1;
254    }
255
256    /**
257     * {@inheritDoc}
258     *
259     * The upper bound of the support is the number of elements.
260     *
261     * @return upper bound of the support
262     */
263    public int getSupportUpperBound() {
264        return getNumberOfElements();
265    }
266
267    /**
268     * {@inheritDoc}
269     *
270     * The support of this distribution is connected.
271     *
272     * @return {@code true}
273     */
274    public boolean isSupportConnected() {
275        return true;
276    }
277
278    /**
279     * {@inheritDoc}
280     */
281    @Override
282    public int sample() {
283        if (sampler == null) {
284            sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent);
285        }
286        return sampler.sample(random);
287    }
288
289    /**
290     * Utility class implementing a rejection inversion sampling method for a discrete,
291     * bounded Zipf distribution that is based on the method described in
292     * <p>
293     * Wolfgang HÃ¶rmann and Gerhard Derflinger
294     * "Rejection-inversion to generate variates from monotone discrete distributions."
295     * ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184.
296     * <p>
297     * The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI).
298     * The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)}
299     * as the integral of the hat function. This function is undefined for
300     * q = 1, which is the reason for the limitation of the exponent.
301     * If instead the integral function
302     * {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used,
303     * for which a meaningful limit exists for q = 1,
304     * the method works for all positive exponents.
305     * <p>
306     * The following implementation uses v := 0 and generates integral numbers
307     * in the range [1, numberOfElements]. This is different to the original method
308     * where v is defined to be positive and numbers are taken from [0, i_max].
309     * This explains why the implementation looks slightly different.
310     *
311     * @since 3.6
312     */
313    static final class ZipfRejectionInversionSampler {
314
315        /** Exponent parameter of the distribution. */
316        private final double exponent;
317        /** Number of elements. */
318        private final int numberOfElements;
319        /** Constant equal to {@code hIntegral(1.5) - 1}. */
320        private final double hIntegralX1;
321        /** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
322        private final double hIntegralNumberOfElements;
323        /** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
324        private final double s;
325
326        /** Simple constructor.
327         * @param numberOfElements number of elements
328         * @param exponent exponent parameter of the distribution
329         */
330        ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) {
331            this.exponent = exponent;
332            this.numberOfElements = numberOfElements;
333            this.hIntegralX1 = hIntegral(1.5) - 1d;
334            this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5);
335            this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2));
336        }
337
338        /** Generate one integral number in the range [1, numberOfElements].
339         * @param random random generator to use
340         * @return generated integral number in the range [1, numberOfElements]
341         */
342        int sample(final RandomGenerator random) {
343            while(true) {
344
345                final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements);
346                // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements]
347
348                double x = hIntegralInverse(u);
349
350                int k = (int)(x + 0.5);
351
352                // Limit k to the range [1, numberOfElements]
353                // (k could be outside due to numerical inaccuracies)
354                if (k < 1) {
355                    k = 1;
356                }
357                else if (k > numberOfElements) {
358                    k = numberOfElements;
359                }
360
361                // Here, the distribution of k is given by:
362                //
363                //   P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
364                //   P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
365                //
366                //   where C := 1 / (hIntegralNumberOfElements - hIntegralX1)
367
368                if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) {
369
370                    // Case k = 1:
371                    //
372                    //   The right inequality is always true, because replacing k by 1 gives
373                    //   u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from
374                    //   (hIntegralX1, hIntegralNumberOfElements].
375                    //
376                    //   Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1
377                    //   and the probability that 1 is returned as random value is
378                    //   P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent
379                    //
380                    // Case k >= 2:
381                    //
382                    //   The left inequality (k - x <= s) is just a short cut
383                    //   to avoid the more expensive evaluation of the right inequality
384                    //   (u >= hIntegral(k + 0.5) - h(k)) in many cases.
385                    //
386                    //   If the left inequality is true, the right inequality is also true:
387                    //     Theorem 2 in the paper is valid for all positive exponents, because
388                    //     the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
389                    //     (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
390                    //     are both fulfilled.
391                    //     Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
392                    //     is a non-decreasing function. If k - x <= s holds,
393                    //     k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
394                    //     -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
395                    //     -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
396                    //     and finally u >= hIntegral(k + 0.5) - h(k).
397                    //
398                    //   Hence, the right inequality determines the acceptance rate:
399                    //   P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
400                    //   The probability that m is returned is given by
401                    //   P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent.
402                    //
403                    // In both cases the probabilities are proportional to the probability mass function
404                    // of the Zipf distribution.
405
406                    return k;
407                }
408            }
409        }
410
411        /**
412         * {@code H(x) :=}
413         * <ul>
414         * <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}</li>
415         * <li>{@code log(x)}, if {@code exponent == 1}</li>
416         * </ul>
417         * H(x) is an integral function of h(x),
418         * the derivative of H(x) is h(x).
419         *
420         * @param x free parameter
421         * @return {@code H(x)}
422         */
423        private double hIntegral(final double x) {
424            final double logX = FastMath.log(x);
425            return helper2((1d-exponent)*logX)*logX;
426        }
427
428        /**
429         * {@code h(x) := 1/x^exponent}
430         *
431         * @param x free parameter
432         * @return h(x)
433         */
434        private double h(final double x) {
435            return FastMath.exp(-exponent * FastMath.log(x));
436        }
437
438        /**
439         * The inverse function of H(x).
440         *
441         * @param x free parameter
442         * @return y for which {@code H(y) = x}
443         */
444        private double hIntegralInverse(final double x) {
445            double t = x*(1d-exponent);
446            if (t < -1d) {
447                // Limit value to the range [-1, +inf).
448                // t could be smaller than -1 in some rare cases due to numerical errors.
449                t = -1;
450            }
451            return FastMath.exp(helper1(t)*x);
452        }
453
454        /**
455         * Helper function that calculates {@code log(1+x)/x}.
456         * <p>
457         * A Taylor series expansion is used, if x is close to 0.
458         *
459         * @param x a value larger than or equal to -1
460         * @return {@code log(1+x)/x}
461         */
462        static double helper1(final double x) {
463            if (FastMath.abs(x)>1e-8) {
464                return FastMath.log1p(x)/x;
465            }
466            else {
467                return 1.-x*((1./2.)-x*((1./3.)-x*(1./4.)));
468            }
469        }
470
471        /**
472         * Helper function to calculate {@code (exp(x)-1)/x}.
473         * <p>
474         * A Taylor series expansion is used, if x is close to 0.
475         *
476         * @param x free parameter
477         * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0
478         */
479        static double helper2(final double x) {
480            if (FastMath.abs(x)>1e-8) {
481                return FastMath.expm1(x)/x;
482            }
483            else {
484                return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.)));
485            }
486        }
487    }
488}