001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021import org.apache.commons.math3.exception.util.LocalizedFormats;
022import org.apache.commons.math3.util.FastMath;
023import org.apache.commons.math3.random.RandomGenerator;
024import org.apache.commons.math3.random.Well19937c;
025
026/**
027 * Implementation of the Zipf distribution.
028 *
029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030 * @version $Id: ZipfDistribution.java 1533974 2013-10-20 20:42:41Z psteitz $
031 */
032public class ZipfDistribution extends AbstractIntegerDistribution {
033    /** Serializable version identifier. */
034    private static final long serialVersionUID = -140627372283420404L;
035    /** Number of elements. */
036    private final int numberOfElements;
037    /** Exponent parameter of the distribution. */
038    private final double exponent;
039    /** Cached numerical mean */
040    private double numericalMean = Double.NaN;
041    /** Whether or not the numerical mean has been calculated */
042    private boolean numericalMeanIsCalculated = false;
043    /** Cached numerical variance */
044    private double numericalVariance = Double.NaN;
045    /** Whether or not the numerical variance has been calculated */
046    private boolean numericalVarianceIsCalculated = false;
047
048    /**
049     * Create a new Zipf distribution with the given number of elements and
050     * exponent.
051     *
052     * @param numberOfElements Number of elements.
053     * @param exponent Exponent.
054     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
055     * or {@code exponent <= 0}.
056     */
057    public ZipfDistribution(final int numberOfElements, final double exponent) {
058        this(new Well19937c(), numberOfElements, exponent);
059    }
060
061    /**
062     * Creates a Zipf distribution.
063     *
064     * @param rng Random number generator.
065     * @param numberOfElements Number of elements.
066     * @param exponent Exponent.
067     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
068     * or {@code exponent <= 0}.
069     * @since 3.1
070     */
071    public ZipfDistribution(RandomGenerator rng,
072                            int numberOfElements,
073                            double exponent)
074        throws NotStrictlyPositiveException {
075        super(rng);
076
077        if (numberOfElements <= 0) {
078            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
079                                                   numberOfElements);
080        }
081        if (exponent <= 0) {
082            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
083                                                   exponent);
084        }
085
086        this.numberOfElements = numberOfElements;
087        this.exponent = exponent;
088    }
089
090    /**
091     * Get the number of elements (e.g. corpus size) for the distribution.
092     *
093     * @return the number of elements
094     */
095    public int getNumberOfElements() {
096        return numberOfElements;
097    }
098
099    /**
100     * Get the exponent characterizing the distribution.
101     *
102     * @return the exponent
103     */
104    public double getExponent() {
105        return exponent;
106    }
107
108    /** {@inheritDoc} */
109    public double probability(final int x) {
110        if (x <= 0 || x > numberOfElements) {
111            return 0.0;
112        }
113
114        return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
115    }
116
117    /** {@inheritDoc} */
118    @Override
119    public double logProbability(int x) {
120        if (x <= 0 || x > numberOfElements) {
121            return Double.NEGATIVE_INFINITY;
122        }
123
124        return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
125    }
126
127    /** {@inheritDoc} */
128    public double cumulativeProbability(final int x) {
129        if (x <= 0) {
130            return 0.0;
131        } else if (x >= numberOfElements) {
132            return 1.0;
133        }
134
135        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
136    }
137
138    /**
139     * {@inheritDoc}
140     *
141     * For number of elements {@code N} and exponent {@code s}, the mean is
142     * {@code Hs1 / Hs}, where
143     * <ul>
144     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
145     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
146     * </ul>
147     */
148    public double getNumericalMean() {
149        if (!numericalMeanIsCalculated) {
150            numericalMean = calculateNumericalMean();
151            numericalMeanIsCalculated = true;
152        }
153        return numericalMean;
154    }
155
156    /**
157     * Used by {@link #getNumericalMean()}.
158     *
159     * @return the mean of this distribution
160     */
161    protected double calculateNumericalMean() {
162        final int N = getNumberOfElements();
163        final double s = getExponent();
164
165        final double Hs1 = generalizedHarmonic(N, s - 1);
166        final double Hs = generalizedHarmonic(N, s);
167
168        return Hs1 / Hs;
169    }
170
171    /**
172     * {@inheritDoc}
173     *
174     * For number of elements {@code N} and exponent {@code s}, the mean is
175     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
176     * <ul>
177     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
178     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
179     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
180     * </ul>
181     */
182    public double getNumericalVariance() {
183        if (!numericalVarianceIsCalculated) {
184            numericalVariance = calculateNumericalVariance();
185            numericalVarianceIsCalculated = true;
186        }
187        return numericalVariance;
188    }
189
190    /**
191     * Used by {@link #getNumericalVariance()}.
192     *
193     * @return the variance of this distribution
194     */
195    protected double calculateNumericalVariance() {
196        final int N = getNumberOfElements();
197        final double s = getExponent();
198
199        final double Hs2 = generalizedHarmonic(N, s - 2);
200        final double Hs1 = generalizedHarmonic(N, s - 1);
201        final double Hs = generalizedHarmonic(N, s);
202
203        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
204    }
205
206    /**
207     * Calculates the Nth generalized harmonic number. See
208     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
209     * Series</a>.
210     *
211     * @param n Term in the series to calculate (must be larger than 1)
212     * @param m Exponent (special case {@code m = 1} is the harmonic series).
213     * @return the n<sup>th</sup> generalized harmonic number.
214     */
215    private double generalizedHarmonic(final int n, final double m) {
216        double value = 0;
217        for (int k = n; k > 0; --k) {
218            value += 1.0 / FastMath.pow(k, m);
219        }
220        return value;
221    }
222
223    /**
224     * {@inheritDoc}
225     *
226     * The lower bound of the support is always 1 no matter the parameters.
227     *
228     * @return lower bound of the support (always 1)
229     */
230    public int getSupportLowerBound() {
231        return 1;
232    }
233
234    /**
235     * {@inheritDoc}
236     *
237     * The upper bound of the support is the number of elements.
238     *
239     * @return upper bound of the support
240     */
241    public int getSupportUpperBound() {
242        return getNumberOfElements();
243    }
244
245    /**
246     * {@inheritDoc}
247     *
248     * The support of this distribution is connected.
249     *
250     * @return {@code true}
251     */
252    public boolean isSupportConnected() {
253        return true;
254    }
255}
256