001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021import org.apache.commons.math3.exception.util.LocalizedFormats;
022import org.apache.commons.math3.random.RandomGenerator;
023import org.apache.commons.math3.random.Well19937c;
024import org.apache.commons.math3.util.FastMath;
025
026/**
027 * Implementation of the Zipf distribution.
028 *
029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030 */
031public class ZipfDistribution extends AbstractIntegerDistribution {
032    /** Serializable version identifier. */
033    private static final long serialVersionUID = -140627372283420404L;
034    /** Number of elements. */
035    private final int numberOfElements;
036    /** Exponent parameter of the distribution. */
037    private final double exponent;
038    /** Cached numerical mean */
039    private double numericalMean = Double.NaN;
040    /** Whether or not the numerical mean has been calculated */
041    private boolean numericalMeanIsCalculated = false;
042    /** Cached numerical variance */
043    private double numericalVariance = Double.NaN;
044    /** Whether or not the numerical variance has been calculated */
045    private boolean numericalVarianceIsCalculated = false;
046    /** The sampler to be used for the sample() method */
047    private transient ZipfRejectionSampler sampler;
048
049    /**
050     * Create a new Zipf distribution with the given number of elements and
051     * exponent.
052     * <p>
053     * <b>Note:</b> this constructor will implicitly create an instance of
054     * {@link Well19937c} as random generator to be used for sampling only (see
055     * {@link #sample()} and {@link #sample(int)}). In case no sampling is
056     * needed for the created distribution, it is advised to pass {@code null}
057     * as random generator via the appropriate constructors to avoid the
058     * additional initialisation overhead.
059     *
060     * @param numberOfElements Number of elements.
061     * @param exponent Exponent.
062     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
063     * or {@code exponent <= 0}.
064     */
065    public ZipfDistribution(final int numberOfElements, final double exponent) {
066        this(new Well19937c(), numberOfElements, exponent);
067    }
068
069    /**
070     * Creates a Zipf distribution.
071     *
072     * @param rng Random number generator.
073     * @param numberOfElements Number of elements.
074     * @param exponent Exponent.
075     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
076     * or {@code exponent <= 0}.
077     * @since 3.1
078     */
079    public ZipfDistribution(RandomGenerator rng,
080                            int numberOfElements,
081                            double exponent)
082        throws NotStrictlyPositiveException {
083        super(rng);
084
085        if (numberOfElements <= 0) {
086            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
087                                                   numberOfElements);
088        }
089        if (exponent <= 0) {
090            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
091                                                   exponent);
092        }
093
094        this.numberOfElements = numberOfElements;
095        this.exponent = exponent;
096    }
097
098    /**
099     * Get the number of elements (e.g. corpus size) for the distribution.
100     *
101     * @return the number of elements
102     */
103    public int getNumberOfElements() {
104        return numberOfElements;
105    }
106
107    /**
108     * Get the exponent characterizing the distribution.
109     *
110     * @return the exponent
111     */
112    public double getExponent() {
113        return exponent;
114    }
115
116    /** {@inheritDoc} */
117    public double probability(final int x) {
118        if (x <= 0 || x > numberOfElements) {
119            return 0.0;
120        }
121
122        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
123    }
124
125    /** {@inheritDoc} */
126    @Override
127    public double logProbability(int x) {
128        if (x <= 0 || x > numberOfElements) {
129            return Double.NEGATIVE_INFINITY;
130        }
131
132        return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
133    }
134
135    /** {@inheritDoc} */
136    public double cumulativeProbability(final int x) {
137        if (x <= 0) {
138            return 0.0;
139        } else if (x >= numberOfElements) {
140            return 1.0;
141        }
142
143        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
144    }
145
146    /**
147     * {@inheritDoc}
148     *
149     * For number of elements {@code N} and exponent {@code s}, the mean is
150     * {@code Hs1 / Hs}, where
151     * <ul>
152     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
153     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
154     * </ul>
155     */
156    public double getNumericalMean() {
157        if (!numericalMeanIsCalculated) {
158            numericalMean = calculateNumericalMean();
159            numericalMeanIsCalculated = true;
160        }
161        return numericalMean;
162    }
163
164    /**
165     * Used by {@link #getNumericalMean()}.
166     *
167     * @return the mean of this distribution
168     */
169    protected double calculateNumericalMean() {
170        final int N = getNumberOfElements();
171        final double s = getExponent();
172
173        final double Hs1 = generalizedHarmonic(N, s - 1);
174        final double Hs = generalizedHarmonic(N, s);
175
176        return Hs1 / Hs;
177    }
178
179    /**
180     * {@inheritDoc}
181     *
182     * For number of elements {@code N} and exponent {@code s}, the mean is
183     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
184     * <ul>
185     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
186     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
187     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
188     * </ul>
189     */
190    public double getNumericalVariance() {
191        if (!numericalVarianceIsCalculated) {
192            numericalVariance = calculateNumericalVariance();
193            numericalVarianceIsCalculated = true;
194        }
195        return numericalVariance;
196    }
197
198    /**
199     * Used by {@link #getNumericalVariance()}.
200     *
201     * @return the variance of this distribution
202     */
203    protected double calculateNumericalVariance() {
204        final int N = getNumberOfElements();
205        final double s = getExponent();
206
207        final double Hs2 = generalizedHarmonic(N, s - 2);
208        final double Hs1 = generalizedHarmonic(N, s - 1);
209        final double Hs = generalizedHarmonic(N, s);
210
211        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
212    }
213
214    /**
215     * Calculates the Nth generalized harmonic number. See
216     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
217     * Series</a>.
218     *
219     * @param n Term in the series to calculate (must be larger than 1)
220     * @param m Exponent (special case {@code m = 1} is the harmonic series).
221     * @return the n<sup>th</sup> generalized harmonic number.
222     */
223    private double generalizedHarmonic(final int n, final double m) {
224        double value = 0;
225        for (int k = n; k > 0; --k) {
226            value += 1.0 / FastMath.pow(k, m);
227        }
228        return value;
229    }
230
231    /**
232     * {@inheritDoc}
233     *
234     * The lower bound of the support is always 1 no matter the parameters.
235     *
236     * @return lower bound of the support (always 1)
237     */
238    public int getSupportLowerBound() {
239        return 1;
240    }
241
242    /**
243     * {@inheritDoc}
244     *
245     * The upper bound of the support is the number of elements.
246     *
247     * @return upper bound of the support
248     */
249    public int getSupportUpperBound() {
250        return getNumberOfElements();
251    }
252
253    /**
254     * {@inheritDoc}
255     *
256     * The support of this distribution is connected.
257     *
258     * @return {@code true}
259     */
260    public boolean isSupportConnected() {
261        return true;
262    }
263
264    /**
265     * {@inheritDoc}
266     * <p>
267     * An instrumental distribution g(k) is used to generate random values by
268     * rejection sampling. g(k) is defined as g(1):= 1 and g(k) := I(-s,k-1/2,k+1/2)
269     * for k larger than 1, where s denotes the exponent of the Zipf distribution
270     * and I(r,a,b) is the integral of x^r for x from a to b.
271     * <p>
272     * Since 1^x^s is a convex function, Jensens's inequality gives
273     * I(-s,k-1/2,k+1/2) >= 1/k^s for all positive k and non-negative s.
274     * In order to limit the rejection rate for large exponents s,
275     * the instrumental distribution weight is differently defined for value 1.
276     */
277    @Override
278    public int sample() {
279        if (sampler == null) {
280            sampler = new ZipfRejectionSampler(numberOfElements, exponent);
281        }
282        return sampler.sample(random);
283    }
284
285    /**
286     * Utility class implementing a rejection sampling method for a discrete,
287     * bounded Zipf distribution.
288     *
289     * @since 3.6
290     */
291    static final class ZipfRejectionSampler {
292
293        /** Number of elements. */
294        private final int numberOfElements;
295        /** Exponent parameter of the distribution. */
296        private final double exponent;
297        /** Cached tail weight of instrumental distribution used for rejection sampling */
298        private double instrumentalDistributionTailWeight = Double.NaN;
299
300        /**
301         * Simple constructor.
302         * @param numberOfElements number of elements
303         * @param exponent exponent parameter of the distribution
304         */
305        ZipfRejectionSampler(final int numberOfElements, final double exponent) {
306            this.numberOfElements = numberOfElements;
307            this.exponent = exponent;
308        }
309
310        /** Generate a random value sampled from this distribution.
311         * @param random random generator to use
312         * @return random value sampled from this distribution
313         */
314        int sample(final RandomGenerator random) {
315            if (Double.isNaN(instrumentalDistributionTailWeight)) {
316                instrumentalDistributionTailWeight = integratePowerFunction(-exponent, 1.5, numberOfElements+0.5);
317            }
318
319            while(true) {
320                final double randomValue = random.nextDouble()*(instrumentalDistributionTailWeight + 1.);
321                if (randomValue < instrumentalDistributionTailWeight) {
322                    final double q = randomValue / instrumentalDistributionTailWeight;
323                    final int sample = sampleFromInstrumentalDistributionTail(q);
324                    if (random.nextDouble() < acceptanceRateForTailSample(sample)) {
325                        return sample;
326                    }
327                }
328                else {
329                    return 1;
330                }
331            }
332        }
333
334        /**
335         * Returns a sample from the instrumental distribution tail for a given
336         * uniformly distributed random value.
337         *
338         * @param q a uniformly distributed random value taken from [0,1]
339         * @return a sample in the range [2, {@link #numberOfElements}]
340         */
341        int sampleFromInstrumentalDistributionTail(double q) {
342            final double a = 1.5;
343            final double b = numberOfElements + 0.5;
344            final double logBdviA = FastMath.log(b / a);
345
346            final int result  = (int) (a * FastMath.exp(logBdviA * helper1(q, logBdviA * (1. - exponent))) + 0.5);
347            if (result < 2) {
348                return 2;
349            }
350            if (result > numberOfElements) {
351                return numberOfElements;
352            }
353            return result;
354        }
355
356        /**
357         * Helper function that calculates log((1-q)+q*exp(x))/x.
358         * <p>
359         * A Taylor series expansion is used, if x is close to 0.
360         *
361         * @param q a value in the range [0,1]
362         * @param x free parameter
363         * @return log((1-q)+q*exp(x))/x
364         */
365        static double helper1(final double q, final double x) {
366            if (Math.abs(x) > 1e-8) {
367                return FastMath.log((1.-q)+q*FastMath.exp(x))/x;
368            }
369            else {
370                return q*(1.+(1./2.)*x*(1.-q)*(1+(1./3.)*x*((1.-2.*q) + (1./4.)*x*(6*q*q*(q-1)+1))));
371            }
372        }
373
374        /**
375         * Helper function to calculate (exp(x)-1)/x.
376         * <p>
377         * A Taylor series expansion is used, if x is close to 0.
378         *
379         * @param x free parameter
380         * @return (exp(x)-1)/x if x is non-zero, 1 if x=0
381         */
382        static double helper2(final double x) {
383            if (FastMath.abs(x)>1e-8) {
384                return FastMath.expm1(x)/x;
385            }
386            else {
387                return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.)));
388            }
389        }
390
391        /**
392         * Integrates the power function x^r from x=a to b.
393         *
394         * @param r the exponent
395         * @param a the integral lower bound
396         * @param b the integral upper bound
397         * @return the calculated integral value
398         */
399        static double integratePowerFunction(final double r, final double a, final double b) {
400            final double logA = FastMath.log(a);
401            final double logBdivA = FastMath.log(b/a);
402            return FastMath.exp((1.+r)*logA)*helper2((1.+r)*logBdivA)*logBdivA;
403
404        }
405
406        /**
407         * Calculates the acceptance rate for a sample taken from the tail of the instrumental distribution.
408         * <p>
409         * The acceptance rate is given by the ratio k^(-s)/I(-s,k-0.5, k+0.5)
410         * where I(r,a,b) is the integral of x^r for x from a to b.
411         *
412         * @param k the value which has been sampled using the instrumental distribution
413         * @return the acceptance rate
414         */
415        double acceptanceRateForTailSample(int k) {
416            final double a = FastMath.log1p(1./(2.*k-1.));
417            final double b = FastMath.log1p(2./(2.*k-1.));
418            return FastMath.exp((1.-exponent)*a)/(k*b*helper2((1.-exponent)*b));
419        }
420    }
421
422}