1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.statistics.distribution;
19
20 import org.apache.commons.rng.UniformRandomProvider;
21 import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
22
23 /**
24 * Implementation of the Zipf distribution.
25 *
26 * <p>The probability mass function of \( X \) is:
27 *
28 * <p>\[ f(k; N, s) = \frac{1/k^s}{H_{N,s}} \]
29 *
30 * <p>for \( N \in \{1, 2, 3, \dots\} \) the number of elements,
31 * \( s \gt 0 \) the exponent characterizing the distribution,
32 * \( k \in \{1, 2, \dots, N\} \) the element rank, and
33 * \( H_{N,s} \) is the normalizing constant which corresponds to the
34 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
35 * generalized harmonic number</a> of order N of s.
36 *
37 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution (Wikipedia)</a>
38 */
39 public final class ZipfDistribution extends AbstractDiscreteDistribution {
40 /** Number of elements. */
41 private final int numberOfElements;
42 /** Exponent parameter of the distribution. */
43 private final double exponent;
44 /** Cached value of the nth generalized harmonic. */
45 private final double nthHarmonic;
46 /** Cached value of the log of the nth generalized harmonic. */
47 private final double logNthHarmonic;
48
49 /**
50 * @param numberOfElements Number of elements.
51 * @param exponent Exponent.
52 */
53 private ZipfDistribution(int numberOfElements,
54 double exponent) {
55 this.numberOfElements = numberOfElements;
56 this.exponent = exponent;
57 this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
58 logNthHarmonic = Math.log(nthHarmonic);
59 }
60
61 /**
62 * Creates a Zipf distribution.
63 *
64 * @param numberOfElements Number of elements.
65 * @param exponent Exponent.
66 * @return the distribution
67 * @exception IllegalArgumentException if {@code numberOfElements <= 0}
68 * or {@code exponent <= 0}.
69 */
70 public static ZipfDistribution of(int numberOfElements,
71 double exponent) {
72 if (numberOfElements <= 0) {
73 throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
74 numberOfElements);
75 }
76 if (exponent < 0) {
77 throw new DistributionException(DistributionException.NEGATIVE,
78 exponent);
79 }
80 return new ZipfDistribution(numberOfElements, exponent);
81 }
82
83 /**
84 * Gets the number of elements parameter of this distribution.
85 *
86 * @return the number of elements.
87 */
88 public int getNumberOfElements() {
89 return numberOfElements;
90 }
91
92 /**
93 * Gets the exponent parameter of this distribution.
94 *
95 * @return the exponent.
96 */
97 public double getExponent() {
98 return exponent;
99 }
100
101 /** {@inheritDoc} */
102 @Override
103 public double probability(final int x) {
104 if (x <= 0 || x > numberOfElements) {
105 return 0;
106 }
107
108 return Math.pow(x, -exponent) / nthHarmonic;
109 }
110
111 /** {@inheritDoc} */
112 @Override
113 public double logProbability(int x) {
114 if (x <= 0 || x > numberOfElements) {
115 return Double.NEGATIVE_INFINITY;
116 }
117
118 return -Math.log(x) * exponent - logNthHarmonic;
119 }
120
121 /** {@inheritDoc} */
122 @Override
123 public double cumulativeProbability(final int x) {
124 if (x <= 0) {
125 return 0;
126 } else if (x >= numberOfElements) {
127 return 1;
128 }
129
130 return generalizedHarmonic(x, exponent) / nthHarmonic;
131 }
132
133 /** {@inheritDoc} */
134 @Override
135 public double survivalProbability(int x) {
136 if (x <= 0) {
137 return 1;
138 } else if (x >= numberOfElements) {
139 return 0;
140 }
141
142 // See http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
143 // S(x) = P(X > x) = ((x+1)^a Hn,a - (x+1)^a Hx+1,a + 1) / ((x+1)^a Hn,a)
144 // where a = exponent and Hx,a is the generalized harmonic for x with exponent a.
145 final double z = Math.pow(x + 1.0, exponent);
146 // Compute generalizedHarmonic(x, exponent) and generalizedHarmonic(x+1, exponent)
147 final double hx = generalizedHarmonic(x, exponent);
148 final double hx1 = hx + Math.pow(x + 1.0, -exponent);
149 // Compute the survival function
150 final double p = (z * (nthHarmonic - hx1) + 1) / (z * nthHarmonic);
151 // May overflow for large exponent so validate the probability.
152 // If this occurs revert to 1 - CDF(x), reusing the generalized harmonic for x
153 return p <= 1.0 ? p : 1.0 - hx / nthHarmonic;
154 }
155
156 /**
157 * {@inheritDoc}
158 *
159 * <p>For number of elements \( N \) and exponent \( s \), the mean is:
160 *
161 * <p>\[ \frac{H_{N,s-1}}{H_{N,s}} \]
162 *
163 * <p>where \( H_{N,k} \) is the
164 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
165 * generalized harmonic number</a> of order \( N \) of \( k \).
166 */
167 @Override
168 public double getMean() {
169 final int N = getNumberOfElements();
170 final double s = getExponent();
171
172 final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
173
174 return Hs1 / nthHarmonic;
175 }
176
177 /**
178 * {@inheritDoc}
179 *
180 * <p>For number of elements \( N \) and exponent \( s \), the variance is:
181 *
182 * <p>\[ \frac{H_{N,s-2}}{H_{N,s}} - \frac{H_{N,s-1}^2}{H_{N,s}^2} \]
183 *
184 * <p>where \( H_{N,k} \) is the
185 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
186 * generalized harmonic number</a> of order \( N \) of \( k \).
187 */
188 @Override
189 public double getVariance() {
190 final int N = getNumberOfElements();
191 final double s = getExponent();
192
193 final double Hs2 = generalizedHarmonicAscendingSum(N, s - 2);
194 final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
195 final double Hs = nthHarmonic;
196
197 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
198 }
199
200 /**
201 * Calculates the Nth generalized harmonic number. See
202 * <a href="https://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
203 * Series</a>.
204 *
205 * <p>Assumes {@code exponent > 0} to arrange the terms to sum from small to large.
206 *
207 * @param n Term in the series to calculate (must be larger than 1)
208 * @param m Exponent (special case {@code m = 1} is the harmonic series).
209 * @return the n<sup>th</sup> generalized harmonic number.
210 */
211 private static double generalizedHarmonic(final int n, final double m) {
212 double value = 0;
213 // Sum small to large
214 for (int k = n; k >= 1; k--) {
215 value += Math.pow(k, -m);
216 }
217 return value;
218 }
219
220 /**
221 * Calculates the Nth generalized harmonic number.
222 *
223 * <p>Checks the value of the {@code exponent} to arrange the terms to sum from from small to large.
224 *
225 * @param n Term in the series to calculate (must be larger than 1)
226 * @param m Exponent (special case {@code m = 1} is the harmonic series).
227 * @return the n<sup>th</sup> generalized harmonic number.
228 */
229 private static double generalizedHarmonicAscendingSum(final int n, final double m) {
230 double value = 0;
231 // Sum small to large
232 // If m < 0 then sum ascending, otherwise descending
233 if (m < 0) {
234 for (int k = 1; k <= n; k++) {
235 value += Math.pow(k, -m);
236 }
237 } else {
238 for (int k = n; k >= 1; k--) {
239 value += Math.pow(k, -m);
240 }
241 }
242 return value;
243 }
244
245 /**
246 * {@inheritDoc}
247 *
248 * <p>The lower bound of the support is always 1.
249 *
250 * @return 1.
251 */
252 @Override
253 public int getSupportLowerBound() {
254 return 1;
255 }
256
257 /**
258 * {@inheritDoc}
259 *
260 * <p>The upper bound of the support is the number of elements.
261 *
262 * @return number of elements.
263 */
264 @Override
265 public int getSupportUpperBound() {
266 return getNumberOfElements();
267 }
268
269 /** {@inheritDoc} */
270 @Override
271 public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) {
272 // Zipf distribution sampler.
273 return RejectionInversionZipfSampler.of(rng, numberOfElements, exponent)::sample;
274 }
275 }