001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.statistics.distribution;
019
020import org.apache.commons.rng.UniformRandomProvider;
021import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
022
023/**
024 * Implementation of the <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution</a>.
025 * <p>
026 * <strong>Parameters:</strong>
027 * For a random variable {@code X} whose values are distributed according to this
028 * distribution, the probability mass function is given by
029 * <pre>
030 *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
031 * </pre>
032 * {@code H(N,s)} is the normalizing constant
033 * which corresponds to the generalized harmonic number of order N of s.
034 * <ul>
035 * <li>{@code N} is the number of elements</li>
036 * <li>{@code s} is the exponent</li>
037 * </ul>
038 */
039public class ZipfDistribution extends AbstractDiscreteDistribution {
040    /** Number of elements. */
041    private final int numberOfElements;
042    /** Exponent parameter of the distribution. */
043    private final double exponent;
044    /** Cached values of the nth generalized harmonic. */
045    private final double nthHarmonic;
046
047    /**
048     * Creates a distribution.
049     *
050     * @param numberOfElements Number of elements.
051     * @param exponent Exponent.
052     * @exception IllegalArgumentException if {@code numberOfElements <= 0}
053     * or {@code exponent <= 0}.
054     */
055    public ZipfDistribution(int numberOfElements,
056                            double exponent) {
057        if (numberOfElements <= 0) {
058            throw new DistributionException(DistributionException.NEGATIVE,
059                                            numberOfElements);
060        }
061        if (exponent <= 0) {
062            throw new DistributionException(DistributionException.NEGATIVE,
063                                            exponent);
064        }
065
066        this.numberOfElements = numberOfElements;
067        this.exponent = exponent;
068        this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
069    }
070
071    /**
072     * Get the number of elements (e.g. corpus size) for the distribution.
073     *
074     * @return the number of elements
075     */
076    public int getNumberOfElements() {
077        return numberOfElements;
078    }
079
080    /**
081     * Get the exponent characterizing the distribution.
082     *
083     * @return the exponent
084     */
085    public double getExponent() {
086        return exponent;
087    }
088
089    /** {@inheritDoc} */
090    @Override
091    public double probability(final int x) {
092        if (x <= 0 || x > numberOfElements) {
093            return 0;
094        }
095
096        return (1 / Math.pow(x, exponent)) / nthHarmonic;
097    }
098
099    /** {@inheritDoc} */
100    @Override
101    public double logProbability(int x) {
102        if (x <= 0 || x > numberOfElements) {
103            return Double.NEGATIVE_INFINITY;
104        }
105
106        return -Math.log(x) * exponent - Math.log(nthHarmonic);
107    }
108
109    /** {@inheritDoc} */
110    @Override
111    public double cumulativeProbability(final int x) {
112        if (x <= 0) {
113            return 0;
114        } else if (x >= numberOfElements) {
115            return 1;
116        }
117
118        return generalizedHarmonic(x, exponent) / nthHarmonic;
119    }
120
121    /**
122     * {@inheritDoc}
123     *
124     * For number of elements {@code N} and exponent {@code s}, the mean is
125     * {@code Hs1 / Hs}, where
126     * <ul>
127     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
128     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
129     * </ul>
130     */
131    @Override
132    public double getMean() {
133        final int N = getNumberOfElements();
134        final double s = getExponent();
135
136        final double Hs1 = generalizedHarmonic(N, s - 1);
137        final double Hs = nthHarmonic;
138
139        return Hs1 / Hs;
140    }
141
142    /**
143     * {@inheritDoc}
144     *
145     * For number of elements {@code N} and exponent {@code s}, the mean is
146     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
147     * <ul>
148     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
149     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
150     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
151     * </ul>
152     */
153    @Override
154    public double getVariance() {
155        final int N = getNumberOfElements();
156        final double s = getExponent();
157
158        final double Hs2 = generalizedHarmonic(N, s - 2);
159        final double Hs1 = generalizedHarmonic(N, s - 1);
160        final double Hs = nthHarmonic;
161
162        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
163    }
164
165    /**
166     * Calculates the Nth generalized harmonic number. See
167     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
168     * Series</a>.
169     *
170     * @param n Term in the series to calculate (must be larger than 1)
171     * @param m Exponent (special case {@code m = 1} is the harmonic series).
172     * @return the n<sup>th</sup> generalized harmonic number.
173     */
174    private double generalizedHarmonic(final int n, final double m) {
175        double value = 0;
176        for (int k = n; k > 0; --k) {
177            value += 1 / Math.pow(k, m);
178        }
179        return value;
180    }
181
182    /**
183     * {@inheritDoc}
184     *
185     * The lower bound of the support is always 1 no matter the parameters.
186     *
187     * @return lower bound of the support (always 1)
188     */
189    @Override
190    public int getSupportLowerBound() {
191        return 1;
192    }
193
194    /**
195     * {@inheritDoc}
196     *
197     * The upper bound of the support is the number of elements.
198     *
199     * @return upper bound of the support
200     */
201    @Override
202    public int getSupportUpperBound() {
203        return getNumberOfElements();
204    }
205
206    /**
207     * {@inheritDoc}
208     *
209     * The support of this distribution is connected.
210     *
211     * @return {@code true}
212     */
213    @Override
214    public boolean isSupportConnected() {
215        return true;
216    }
217
218    /**{@inheritDoc} */
219    @Override
220    public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) {
221        // Zipf distribution sampler.
222        return new RejectionInversionZipfSampler(rng, numberOfElements, exponent)::sample;
223    }
224}