001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021import org.apache.commons.math3.exception.util.LocalizedFormats;
022import org.apache.commons.math3.random.RandomGenerator;
023import org.apache.commons.math3.random.Well19937c;
024import org.apache.commons.math3.util.FastMath;
025
026/**
027 * Implementation of the Zipf distribution.
028 *
029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030 */
031public class ZipfDistribution extends AbstractIntegerDistribution {
032    /** Serializable version identifier. */
033    private static final long serialVersionUID = -140627372283420404L;
034    /** Number of elements. */
035    private final int numberOfElements;
036    /** Exponent parameter of the distribution. */
037    private final double exponent;
038    /** Cached numerical mean */
039    private double numericalMean = Double.NaN;
040    /** Whether or not the numerical mean has been calculated */
041    private boolean numericalMeanIsCalculated = false;
042    /** Cached numerical variance */
043    private double numericalVariance = Double.NaN;
044    /** Whether or not the numerical variance has been calculated */
045    private boolean numericalVarianceIsCalculated = false;
046
047    /**
048     * Create a new Zipf distribution with the given number of elements and
049     * exponent.
050     * <p>
051     * <b>Note:</b> this constructor will implicitly create an instance of
052     * {@link Well19937c} as random generator to be used for sampling only (see
053     * {@link #sample()} and {@link #sample(int)}). In case no sampling is
054     * needed for the created distribution, it is advised to pass {@code null}
055     * as random generator via the appropriate constructors to avoid the
056     * additional initialisation overhead.
057     *
058     * @param numberOfElements Number of elements.
059     * @param exponent Exponent.
060     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
061     * or {@code exponent <= 0}.
062     */
063    public ZipfDistribution(final int numberOfElements, final double exponent) {
064        this(new Well19937c(), numberOfElements, exponent);
065    }
066
067    /**
068     * Creates a Zipf distribution.
069     *
070     * @param rng Random number generator.
071     * @param numberOfElements Number of elements.
072     * @param exponent Exponent.
073     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
074     * or {@code exponent <= 0}.
075     * @since 3.1
076     */
077    public ZipfDistribution(RandomGenerator rng,
078                            int numberOfElements,
079                            double exponent)
080        throws NotStrictlyPositiveException {
081        super(rng);
082
083        if (numberOfElements <= 0) {
084            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
085                                                   numberOfElements);
086        }
087        if (exponent <= 0) {
088            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
089                                                   exponent);
090        }
091
092        this.numberOfElements = numberOfElements;
093        this.exponent = exponent;
094    }
095
096    /**
097     * Get the number of elements (e.g. corpus size) for the distribution.
098     *
099     * @return the number of elements
100     */
101    public int getNumberOfElements() {
102        return numberOfElements;
103    }
104
105    /**
106     * Get the exponent characterizing the distribution.
107     *
108     * @return the exponent
109     */
110    public double getExponent() {
111        return exponent;
112    }
113
114    /** {@inheritDoc} */
115    public double probability(final int x) {
116        if (x <= 0 || x > numberOfElements) {
117            return 0.0;
118        }
119
120        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
121    }
122
123    /** {@inheritDoc} */
124    @Override
125    public double logProbability(int x) {
126        if (x <= 0 || x > numberOfElements) {
127            return Double.NEGATIVE_INFINITY;
128        }
129
130        return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
131    }
132
133    /** {@inheritDoc} */
134    public double cumulativeProbability(final int x) {
135        if (x <= 0) {
136            return 0.0;
137        } else if (x >= numberOfElements) {
138            return 1.0;
139        }
140
141        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
142    }
143
144    /**
145     * {@inheritDoc}
146     *
147     * For number of elements {@code N} and exponent {@code s}, the mean is
148     * {@code Hs1 / Hs}, where
149     * <ul>
150     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
151     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
152     * </ul>
153     */
154    public double getNumericalMean() {
155        if (!numericalMeanIsCalculated) {
156            numericalMean = calculateNumericalMean();
157            numericalMeanIsCalculated = true;
158        }
159        return numericalMean;
160    }
161
162    /**
163     * Used by {@link #getNumericalMean()}.
164     *
165     * @return the mean of this distribution
166     */
167    protected double calculateNumericalMean() {
168        final int N = getNumberOfElements();
169        final double s = getExponent();
170
171        final double Hs1 = generalizedHarmonic(N, s - 1);
172        final double Hs = generalizedHarmonic(N, s);
173
174        return Hs1 / Hs;
175    }
176
177    /**
178     * {@inheritDoc}
179     *
180     * For number of elements {@code N} and exponent {@code s}, the mean is
181     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
182     * <ul>
183     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
184     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
185     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
186     * </ul>
187     */
188    public double getNumericalVariance() {
189        if (!numericalVarianceIsCalculated) {
190            numericalVariance = calculateNumericalVariance();
191            numericalVarianceIsCalculated = true;
192        }
193        return numericalVariance;
194    }
195
196    /**
197     * Used by {@link #getNumericalVariance()}.
198     *
199     * @return the variance of this distribution
200     */
201    protected double calculateNumericalVariance() {
202        final int N = getNumberOfElements();
203        final double s = getExponent();
204
205        final double Hs2 = generalizedHarmonic(N, s - 2);
206        final double Hs1 = generalizedHarmonic(N, s - 1);
207        final double Hs = generalizedHarmonic(N, s);
208
209        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
210    }
211
212    /**
213     * Calculates the Nth generalized harmonic number. See
214     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
215     * Series</a>.
216     *
217     * @param n Term in the series to calculate (must be larger than 1)
218     * @param m Exponent (special case {@code m = 1} is the harmonic series).
219     * @return the n<sup>th</sup> generalized harmonic number.
220     */
221    private double generalizedHarmonic(final int n, final double m) {
222        double value = 0;
223        for (int k = n; k > 0; --k) {
224            value += 1.0 / FastMath.pow(k, m);
225        }
226        return value;
227    }
228
229    /**
230     * {@inheritDoc}
231     *
232     * The lower bound of the support is always 1 no matter the parameters.
233     *
234     * @return lower bound of the support (always 1)
235     */
236    public int getSupportLowerBound() {
237        return 1;
238    }
239
240    /**
241     * {@inheritDoc}
242     *
243     * The upper bound of the support is the number of elements.
244     *
245     * @return upper bound of the support
246     */
247    public int getSupportUpperBound() {
248        return getNumberOfElements();
249    }
250
251    /**
252     * {@inheritDoc}
253     *
254     * The support of this distribution is connected.
255     *
256     * @return {@code true}
257     */
258    public boolean isSupportConnected() {
259        return true;
260    }
261}
262