001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.statistics.distribution;
019
020import org.apache.commons.rng.UniformRandomProvider;
021import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
022
023/**
024 * Implementation of the Zipf distribution.
025 *
026 * <p>The probability mass function of \( X \) is:
027 *
028 * <p>\[ f(k; N, s) = \frac{1/k^s}{H_{N,s}} \]
029 *
030 * <p>for \( N \in \{1, 2, 3, \dots\} \) the number of elements,
031 * \( s \gt 0 \) the exponent characterizing the distribution,
032 * \( k \in \{1, 2, \dots, N\} \) the element rank, and
033 * \( H_{N,s} \) is the normalizing constant which corresponds to the
034 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
035 * generalized harmonic number</a> of order N of s.
036 *
037 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution (Wikipedia)</a>
038 */
039public final class ZipfDistribution extends AbstractDiscreteDistribution {
040    /** Number of elements. */
041    private final int numberOfElements;
042    /** Exponent parameter of the distribution. */
043    private final double exponent;
044    /** Cached value of the nth generalized harmonic. */
045    private final double nthHarmonic;
046    /** Cached value of the log of the nth generalized harmonic. */
047    private final double logNthHarmonic;
048
049    /**
050     * @param numberOfElements Number of elements.
051     * @param exponent Exponent.
052     */
053    private ZipfDistribution(int numberOfElements,
054                             double exponent) {
055        this.numberOfElements = numberOfElements;
056        this.exponent = exponent;
057        this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
058        logNthHarmonic = Math.log(nthHarmonic);
059    }
060
061    /**
062     * Creates a Zipf distribution.
063     *
064     * @param numberOfElements Number of elements.
065     * @param exponent Exponent.
066     * @return the distribution
067     * @exception IllegalArgumentException if {@code numberOfElements <= 0}
068     * or {@code exponent <= 0}.
069     */
070    public static ZipfDistribution of(int numberOfElements,
071                                      double exponent) {
072        if (numberOfElements <= 0) {
073            throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
074                                            numberOfElements);
075        }
076        if (exponent < 0) {
077            throw new DistributionException(DistributionException.NEGATIVE,
078                                            exponent);
079        }
080        return new ZipfDistribution(numberOfElements, exponent);
081    }
082
083    /**
084     * Gets the number of elements parameter of this distribution.
085     *
086     * @return the number of elements.
087     */
088    public int getNumberOfElements() {
089        return numberOfElements;
090    }
091
092    /**
093     * Gets the exponent parameter of this distribution.
094     *
095     * @return the exponent.
096     */
097    public double getExponent() {
098        return exponent;
099    }
100
101    /** {@inheritDoc} */
102    @Override
103    public double probability(final int x) {
104        if (x <= 0 || x > numberOfElements) {
105            return 0;
106        }
107
108        return Math.pow(x, -exponent) / nthHarmonic;
109    }
110
111    /** {@inheritDoc} */
112    @Override
113    public double logProbability(int x) {
114        if (x <= 0 || x > numberOfElements) {
115            return Double.NEGATIVE_INFINITY;
116        }
117
118        return -Math.log(x) * exponent - logNthHarmonic;
119    }
120
121    /** {@inheritDoc} */
122    @Override
123    public double cumulativeProbability(final int x) {
124        if (x <= 0) {
125            return 0;
126        } else if (x >= numberOfElements) {
127            return 1;
128        }
129
130        return generalizedHarmonic(x, exponent) / nthHarmonic;
131    }
132
133    /** {@inheritDoc} */
134    @Override
135    public double survivalProbability(int x) {
136        if (x <= 0) {
137            return 1;
138        } else if (x >= numberOfElements) {
139            return 0;
140        }
141
142        // See http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
143        // S(x) = P(X > x) = ((x+1)^a Hn,a - (x+1)^a Hx+1,a + 1) / ((x+1)^a Hn,a)
144        // where a = exponent and Hx,a is the generalized harmonic for x with exponent a.
145        final double z = Math.pow(x + 1.0, exponent);
146        // Compute generalizedHarmonic(x, exponent) and generalizedHarmonic(x+1, exponent)
147        final double hx = generalizedHarmonic(x, exponent);
148        final double hx1 = hx + Math.pow(x + 1.0, -exponent);
149        // Compute the survival function
150        final double p = (z * (nthHarmonic - hx1) + 1) / (z * nthHarmonic);
151        // May overflow for large exponent so validate the probability.
152        // If this occurs revert to 1 - CDF(x), reusing the generalized harmonic for x
153        return p <= 1.0 ? p : 1.0 - hx / nthHarmonic;
154    }
155
156    /**
157     * {@inheritDoc}
158     *
159     * <p>For number of elements \( N \) and exponent \( s \), the mean is:
160     *
161     * <p>\[ \frac{H_{N,s-1}}{H_{N,s}} \]
162     *
163     * <p>where \( H_{N,k} \) is the
164     * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
165     * generalized harmonic number</a> of order \( N \) of \( k \).
166     */
167    @Override
168    public double getMean() {
169        final int N = getNumberOfElements();
170        final double s = getExponent();
171
172        final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
173
174        return Hs1 / nthHarmonic;
175    }
176
177    /**
178     * {@inheritDoc}
179     *
180     * <p>For number of elements \( N \) and exponent \( s \), the variance is:
181     *
182     * <p>\[ \frac{H_{N,s-2}}{H_{N,s}} - \frac{H_{N,s-1}^2}{H_{N,s}^2} \]
183     *
184     * <p>where \( H_{N,k} \) is the
185     * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
186     * generalized harmonic number</a> of order \( N \) of \( k \).
187     */
188    @Override
189    public double getVariance() {
190        final int N = getNumberOfElements();
191        final double s = getExponent();
192
193        final double Hs2 = generalizedHarmonicAscendingSum(N, s - 2);
194        final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
195        final double Hs = nthHarmonic;
196
197        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
198    }
199
200    /**
201     * Calculates the Nth generalized harmonic number. See
202     * <a href="https://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
203     * Series</a>.
204     *
205     * <p>Assumes {@code exponent > 0} to arrange the terms to sum from small to large.
206     *
207     * @param n Term in the series to calculate (must be larger than 1)
208     * @param m Exponent (special case {@code m = 1} is the harmonic series).
209     * @return the n<sup>th</sup> generalized harmonic number.
210     */
211    private static double generalizedHarmonic(final int n, final double m) {
212        double value = 0;
213        // Sum small to large
214        for (int k = n; k >= 1; k--) {
215            value += Math.pow(k, -m);
216        }
217        return value;
218    }
219
220    /**
221     * Calculates the Nth generalized harmonic number.
222     *
223     * <p>Checks the value of the {@code exponent} to arrange the terms to sum from from small to large.
224     *
225     * @param n Term in the series to calculate (must be larger than 1)
226     * @param m Exponent (special case {@code m = 1} is the harmonic series).
227     * @return the n<sup>th</sup> generalized harmonic number.
228     */
229    private static double generalizedHarmonicAscendingSum(final int n, final double m) {
230        double value = 0;
231        // Sum small to large
232        // If m < 0 then sum ascending, otherwise descending
233        if (m < 0) {
234            for (int k = 1; k <= n; k++) {
235                value += Math.pow(k, -m);
236            }
237        } else {
238            for (int k = n; k >= 1; k--) {
239                value += Math.pow(k, -m);
240            }
241        }
242        return value;
243    }
244
245    /**
246     * {@inheritDoc}
247     *
248     * <p>The lower bound of the support is always 1.
249     *
250     * @return 1.
251     */
252    @Override
253    public int getSupportLowerBound() {
254        return 1;
255    }
256
257    /**
258     * {@inheritDoc}
259     *
260     * <p>The upper bound of the support is the number of elements.
261     *
262     * @return number of elements.
263     */
264    @Override
265    public int getSupportUpperBound() {
266        return getNumberOfElements();
267    }
268
269    /** {@inheritDoc} */
270    @Override
271    public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) {
272        // Zipf distribution sampler.
273        return RejectionInversionZipfSampler.of(rng, numberOfElements, exponent)::sample;
274    }
275}