/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021import org.apache.commons.math3.exception.util.LocalizedFormats;
022import org.apache.commons.math3.util.FastMath;
023import org.apache.commons.math3.random.RandomGenerator;
024import org.apache.commons.math3.random.Well19937c;
025
026/**
027 * Implementation of the Zipf distribution.
028 *
029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030 */
031public class ZipfDistribution extends AbstractIntegerDistribution {
032    /** Serializable version identifier. */
033    private static final long serialVersionUID = -140627372283420404L;
034    /** Number of elements. */
035    private final int numberOfElements;
036    /** Exponent parameter of the distribution. */
037    private final double exponent;
038    /** Cached numerical mean */
039    private double numericalMean = Double.NaN;
040    /** Whether or not the numerical mean has been calculated */
041    private boolean numericalMeanIsCalculated = false;
042    /** Cached numerical variance */
043    private double numericalVariance = Double.NaN;
044    /** Whether or not the numerical variance has been calculated */
045    private boolean numericalVarianceIsCalculated = false;
046
047    /**
048     * Create a new Zipf distribution with the given number of elements and
049     * exponent.
050     *
051     * @param numberOfElements Number of elements.
052     * @param exponent Exponent.
053     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
054     * or {@code exponent <= 0}.
055     */
056    public ZipfDistribution(final int numberOfElements, final double exponent) {
057        this(new Well19937c(), numberOfElements, exponent);
058    }
059
060    /**
061     * Creates a Zipf distribution.
062     *
063     * @param rng Random number generator.
064     * @param numberOfElements Number of elements.
065     * @param exponent Exponent.
066     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
067     * or {@code exponent <= 0}.
068     * @since 3.1
069     */
070    public ZipfDistribution(RandomGenerator rng,
071                            int numberOfElements,
072                            double exponent)
073        throws NotStrictlyPositiveException {
074        super(rng);
075
076        if (numberOfElements <= 0) {
077            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
078                                                   numberOfElements);
079        }
080        if (exponent <= 0) {
081            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
082                                                   exponent);
083        }
084
085        this.numberOfElements = numberOfElements;
086        this.exponent = exponent;
087    }
088
089    /**
090     * Get the number of elements (e.g. corpus size) for the distribution.
091     *
092     * @return the number of elements
093     */
094    public int getNumberOfElements() {
095        return numberOfElements;
096    }
097
098    /**
099     * Get the exponent characterizing the distribution.
100     *
101     * @return the exponent
102     */
103    public double getExponent() {
104        return exponent;
105    }
106
107    /** {@inheritDoc} */
108    public double probability(final int x) {
109        if (x <= 0 || x > numberOfElements) {
110            return 0.0;
111        }
112
113        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
114    }
115
116    /** {@inheritDoc} */
117    @Override
118    public double logProbability(int x) {
119        if (x <= 0 || x > numberOfElements) {
120            return Double.NEGATIVE_INFINITY;
121        }
122
123        return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
124    }
125
126    /** {@inheritDoc} */
127    public double cumulativeProbability(final int x) {
128        if (x <= 0) {
129            return 0.0;
130        } else if (x >= numberOfElements) {
131            return 1.0;
132        }
133
134        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
135    }
136
137    /**
138     * {@inheritDoc}
139     *
140     * For number of elements {@code N} and exponent {@code s}, the mean is
141     * {@code Hs1 / Hs}, where
142     * <ul>
143     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
144     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
145     * </ul>
146     */
147    public double getNumericalMean() {
148        if (!numericalMeanIsCalculated) {
149            numericalMean = calculateNumericalMean();
150            numericalMeanIsCalculated = true;
151        }
152        return numericalMean;
153    }
154
155    /**
156     * Used by {@link #getNumericalMean()}.
157     *
158     * @return the mean of this distribution
159     */
160    protected double calculateNumericalMean() {
161        final int N = getNumberOfElements();
162        final double s = getExponent();
163
164        final double Hs1 = generalizedHarmonic(N, s - 1);
165        final double Hs = generalizedHarmonic(N, s);
166
167        return Hs1 / Hs;
168    }
169
170    /**
171     * {@inheritDoc}
172     *
173     * For number of elements {@code N} and exponent {@code s}, the mean is
174     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
175     * <ul>
176     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
177     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
178     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
179     * </ul>
180     */
181    public double getNumericalVariance() {
182        if (!numericalVarianceIsCalculated) {
183            numericalVariance = calculateNumericalVariance();
184            numericalVarianceIsCalculated = true;
185        }
186        return numericalVariance;
187    }
188
189    /**
190     * Used by {@link #getNumericalVariance()}.
191     *
192     * @return the variance of this distribution
193     */
194    protected double calculateNumericalVariance() {
195        final int N = getNumberOfElements();
196        final double s = getExponent();
197
198        final double Hs2 = generalizedHarmonic(N, s - 2);
199        final double Hs1 = generalizedHarmonic(N, s - 1);
200        final double Hs = generalizedHarmonic(N, s);
201
202        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
203    }
204
205    /**
206     * Calculates the Nth generalized harmonic number. See
207     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
208     * Series</a>.
209     *
210     * @param n Term in the series to calculate (must be larger than 1)
211     * @param m Exponent (special case {@code m = 1} is the harmonic series).
212     * @return the n<sup>th</sup> generalized harmonic number.
213     */
214    private double generalizedHarmonic(final int n, final double m) {
215        double value = 0;
216        for (int k = n; k > 0; --k) {
217            value += 1.0 / FastMath.pow(k, m);
218        }
219        return value;
220    }
221
222    /**
223     * {@inheritDoc}
224     *
225     * The lower bound of the support is always 1 no matter the parameters.
226     *
227     * @return lower bound of the support (always 1)
228     */
229    public int getSupportLowerBound() {
230        return 1;
231    }
232
233    /**
234     * {@inheritDoc}
235     *
236     * The upper bound of the support is the number of elements.
237     *
238     * @return upper bound of the support
239     */
240    public int getSupportUpperBound() {
241        return getNumberOfElements();
242    }
243
244    /**
245     * {@inheritDoc}
246     *
247     * The support of this distribution is connected.
248     *
249     * @return {@code true}
250     */
251    public boolean isSupportConnected() {
252        return true;
253    }
254}
255