001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math3.distribution; 019 020import org.apache.commons.math3.exception.NotStrictlyPositiveException; 021import org.apache.commons.math3.exception.util.LocalizedFormats; 022import org.apache.commons.math3.random.RandomGenerator; 023import org.apache.commons.math3.random.Well19937c; 024import org.apache.commons.math3.util.FastMath; 025 026/** 027 * Implementation of the Zipf distribution. 028 * <p> 029 * <strong>Parameters:</strong> 030 * For a random variable {@code X} whose values are distributed according to this 031 * distribution, the probability mass function is given by 032 * <pre> 033 * P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}. 034 * </pre> 035 * {@code H(N,s)} is the normalizing constant 036 * which corresponds to the generalized harmonic number of order N of s. 037 * <p> 038 * <ul> 039 * <li>{@code N} is the number of elements</li> 040 * <li>{@code s} is the exponent</li> 041 * </ul> 042 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a> 043 * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a> 044 */ 045public class ZipfDistribution extends AbstractIntegerDistribution { 046 /** Serializable version identifier. */ 047 private static final long serialVersionUID = -140627372283420404L; 048 /** Number of elements. */ 049 private final int numberOfElements; 050 /** Exponent parameter of the distribution. */ 051 private final double exponent; 052 /** Cached numerical mean */ 053 private double numericalMean = Double.NaN; 054 /** Whether or not the numerical mean has been calculated */ 055 private boolean numericalMeanIsCalculated = false; 056 /** Cached numerical variance */ 057 private double numericalVariance = Double.NaN; 058 /** Whether or not the numerical variance has been calculated */ 059 private boolean numericalVarianceIsCalculated = false; 060 /** The sampler to be used for the sample() method */ 061 private transient ZipfRejectionInversionSampler sampler; 062 063 /** 064 * Create a new Zipf distribution with the given number of elements and 065 * exponent. 066 * <p> 067 * <b>Note:</b> this constructor will implicitly create an instance of 068 * {@link Well19937c} as random generator to be used for sampling only (see 069 * {@link #sample()} and {@link #sample(int)}). In case no sampling is 070 * needed for the created distribution, it is advised to pass {@code null} 071 * as random generator via the appropriate constructors to avoid the 072 * additional initialisation overhead. 073 * 074 * @param numberOfElements Number of elements. 075 * @param exponent Exponent. 076 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 077 * or {@code exponent <= 0}. 078 */ 079 public ZipfDistribution(final int numberOfElements, final double exponent) { 080 this(new Well19937c(), numberOfElements, exponent); 081 } 082 083 /** 084 * Creates a Zipf distribution. 085 * 086 * @param rng Random number generator. 087 * @param numberOfElements Number of elements. 088 * @param exponent Exponent. 089 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 090 * or {@code exponent <= 0}. 091 * @since 3.1 092 */ 093 public ZipfDistribution(RandomGenerator rng, 094 int numberOfElements, 095 double exponent) 096 throws NotStrictlyPositiveException { 097 super(rng); 098 099 if (numberOfElements <= 0) { 100 throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, 101 numberOfElements); 102 } 103 if (exponent <= 0) { 104 throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, 105 exponent); 106 } 107 108 this.numberOfElements = numberOfElements; 109 this.exponent = exponent; 110 } 111 112 /** 113 * Get the number of elements (e.g. corpus size) for the distribution. 114 * 115 * @return the number of elements 116 */ 117 public int getNumberOfElements() { 118 return numberOfElements; 119 } 120 121 /** 122 * Get the exponent characterizing the distribution. 123 * 124 * @return the exponent 125 */ 126 public double getExponent() { 127 return exponent; 128 } 129 130 /** {@inheritDoc} */ 131 public double probability(final int x) { 132 if (x <= 0 || x > numberOfElements) { 133 return 0.0; 134 } 135 136 return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent); 137 } 138 139 /** {@inheritDoc} */ 140 @Override 141 public double logProbability(int x) { 142 if (x <= 0 || x > numberOfElements) { 143 return Double.NEGATIVE_INFINITY; 144 } 145 146 return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent)); 147 } 148 149 /** {@inheritDoc} */ 150 public double cumulativeProbability(final int x) { 151 if (x <= 0) { 152 return 0.0; 153 } else if (x >= numberOfElements) { 154 return 1.0; 155 } 156 157 return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent); 158 } 159 160 /** 161 * {@inheritDoc} 162 * 163 * For number of elements {@code N} and exponent {@code s}, the mean is 164 * {@code Hs1 / Hs}, where 165 * <ul> 166 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 167 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 168 * </ul> 169 */ 170 public double getNumericalMean() { 171 if (!numericalMeanIsCalculated) { 172 numericalMean = calculateNumericalMean(); 173 numericalMeanIsCalculated = true; 174 } 175 return numericalMean; 176 } 177 178 /** 179 * Used by {@link #getNumericalMean()}. 180 * 181 * @return the mean of this distribution 182 */ 183 protected double calculateNumericalMean() { 184 final int N = getNumberOfElements(); 185 final double s = getExponent(); 186 187 final double Hs1 = generalizedHarmonic(N, s - 1); 188 final double Hs = generalizedHarmonic(N, s); 189 190 return Hs1 / Hs; 191 } 192 193 /** 194 * {@inheritDoc} 195 * 196 * For number of elements {@code N} and exponent {@code s}, the mean is 197 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where 198 * <ul> 199 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li> 200 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 201 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 202 * </ul> 203 */ 204 public double getNumericalVariance() { 205 if (!numericalVarianceIsCalculated) { 206 numericalVariance = calculateNumericalVariance(); 207 numericalVarianceIsCalculated = true; 208 } 209 return numericalVariance; 210 } 211 212 /** 213 * Used by {@link #getNumericalVariance()}. 214 * 215 * @return the variance of this distribution 216 */ 217 protected double calculateNumericalVariance() { 218 final int N = getNumberOfElements(); 219 final double s = getExponent(); 220 221 final double Hs2 = generalizedHarmonic(N, s - 2); 222 final double Hs1 = generalizedHarmonic(N, s - 1); 223 final double Hs = generalizedHarmonic(N, s); 224 225 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); 226 } 227 228 /** 229 * Calculates the Nth generalized harmonic number. See 230 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic 231 * Series</a>. 232 * 233 * @param n Term in the series to calculate (must be larger than 1) 234 * @param m Exponent (special case {@code m = 1} is the harmonic series). 235 * @return the n<sup>th</sup> generalized harmonic number. 236 */ 237 private double generalizedHarmonic(final int n, final double m) { 238 double value = 0; 239 for (int k = n; k > 0; --k) { 240 value += 1.0 / FastMath.pow(k, m); 241 } 242 return value; 243 } 244 245 /** 246 * {@inheritDoc} 247 * 248 * The lower bound of the support is always 1 no matter the parameters. 249 * 250 * @return lower bound of the support (always 1) 251 */ 252 public int getSupportLowerBound() { 253 return 1; 254 } 255 256 /** 257 * {@inheritDoc} 258 * 259 * The upper bound of the support is the number of elements. 260 * 261 * @return upper bound of the support 262 */ 263 public int getSupportUpperBound() { 264 return getNumberOfElements(); 265 } 266 267 /** 268 * {@inheritDoc} 269 * 270 * The support of this distribution is connected. 271 * 272 * @return {@code true} 273 */ 274 public boolean isSupportConnected() { 275 return true; 276 } 277 278 /** 279 * {@inheritDoc} 280 */ 281 @Override 282 public int sample() { 283 if (sampler == null) { 284 sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent); 285 } 286 return sampler.sample(random); 287 } 288 289 /** 290 * Utility class implementing a rejection inversion sampling method for a discrete, 291 * bounded Zipf distribution that is based on the method described in 292 * <p> 293 * Wolfgang Hörmann and Gerhard Derflinger 294 * "Rejection-inversion to generate variates from monotone discrete distributions." 295 * ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184. 296 * <p> 297 * The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). 298 * The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)} 299 * as the integral of the hat function. This function is undefined for 300 * q = 1, which is the reason for the limitation of the exponent. 301 * If instead the integral function 302 * {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used, 303 * for which a meaningful limit exists for q = 1, 304 * the method works for all positive exponents. 305 * <p> 306 * The following implementation uses v := 0 and generates integral numbers 307 * in the range [1, numberOfElements]. This is different to the original method 308 * where v is defined to be positive and numbers are taken from [0, i_max]. 309 * This explains why the implementation looks slightly different. 310 * 311 * @since 3.6 312 */ 313 static final class ZipfRejectionInversionSampler { 314 315 /** Exponent parameter of the distribution. */ 316 private final double exponent; 317 /** Number of elements. */ 318 private final int numberOfElements; 319 /** Constant equal to {@code hIntegral(1.5) - 1}. */ 320 private final double hIntegralX1; 321 /** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */ 322 private final double hIntegralNumberOfElements; 323 /** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */ 324 private final double s; 325 326 /** Simple constructor. 327 * @param numberOfElements number of elements 328 * @param exponent exponent parameter of the distribution 329 */ 330 ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) { 331 this.exponent = exponent; 332 this.numberOfElements = numberOfElements; 333 this.hIntegralX1 = hIntegral(1.5) - 1d; 334 this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5); 335 this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2)); 336 } 337 338 /** Generate one integral number in the range [1, numberOfElements]. 339 * @param random random generator to use 340 * @return generated integral number in the range [1, numberOfElements] 341 */ 342 int sample(final RandomGenerator random) { 343 while(true) { 344 345 final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements); 346 // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements] 347 348 double x = hIntegralInverse(u); 349 350 int k = (int)(x + 0.5); 351 352 // Limit k to the range [1, numberOfElements] 353 // (k could be outside due to numerical inaccuracies) 354 if (k < 1) { 355 k = 1; 356 } 357 else if (k > numberOfElements) { 358 k = numberOfElements; 359 } 360 361 // Here, the distribution of k is given by: 362 // 363 // P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C 364 // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 365 // 366 // where C := 1 / (hIntegralNumberOfElements - hIntegralX1) 367 368 if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) { 369 370 // Case k = 1: 371 // 372 // The right inequality is always true, because replacing k by 1 gives 373 // u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from 374 // (hIntegralX1, hIntegralNumberOfElements]. 375 // 376 // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 377 // and the probability that 1 is returned as random value is 378 // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent 379 // 380 // Case k >= 2: 381 // 382 // The left inequality (k - x <= s) is just a short cut 383 // to avoid the more expensive evaluation of the right inequality 384 // (u >= hIntegral(k + 0.5) - h(k)) in many cases. 385 // 386 // If the left inequality is true, the right inequality is also true: 387 // Theorem 2 in the paper is valid for all positive exponents, because 388 // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and 389 // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 390 // are both fulfilled. 391 // Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) 392 // is a non-decreasing function. If k - x <= s holds, 393 // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to 394 // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), 395 // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), 396 // and finally u >= hIntegral(k + 0.5) - h(k). 397 // 398 // Hence, the right inequality determines the acceptance rate: 399 // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) 400 // The probability that m is returned is given by 401 // P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent. 402 // 403 // In both cases the probabilities are proportional to the probability mass function 404 // of the Zipf distribution. 405 406 return k; 407 } 408 } 409 } 410 411 /** 412 * {@code H(x) :=} 413 * <ul> 414 * <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}</li> 415 * <li>{@code log(x)}, if {@code exponent == 1}</li> 416 * </ul> 417 * H(x) is an integral function of h(x), 418 * the derivative of H(x) is h(x). 419 * 420 * @param x free parameter 421 * @return {@code H(x)} 422 */ 423 private double hIntegral(final double x) { 424 final double logX = FastMath.log(x); 425 return helper2((1d-exponent)*logX)*logX; 426 } 427 428 /** 429 * {@code h(x) := 1/x^exponent} 430 * 431 * @param x free parameter 432 * @return h(x) 433 */ 434 private double h(final double x) { 435 return FastMath.exp(-exponent * FastMath.log(x)); 436 } 437 438 /** 439 * The inverse function of H(x). 440 * 441 * @param x free parameter 442 * @return y for which {@code H(y) = x} 443 */ 444 private double hIntegralInverse(final double x) { 445 double t = x*(1d-exponent); 446 if (t < -1d) { 447 // Limit value to the range [-1, +inf). 448 // t could be smaller than -1 in some rare cases due to numerical errors. 449 t = -1; 450 } 451 return FastMath.exp(helper1(t)*x); 452 } 453 454 /** 455 * Helper function that calculates {@code log(1+x)/x}. 456 * <p> 457 * A Taylor series expansion is used, if x is close to 0. 458 * 459 * @param x a value larger than or equal to -1 460 * @return {@code log(1+x)/x} 461 */ 462 static double helper1(final double x) { 463 if (FastMath.abs(x)>1e-8) { 464 return FastMath.log1p(x)/x; 465 } 466 else { 467 return 1.-x*((1./2.)-x*((1./3.)-x*(1./4.))); 468 } 469 } 470 471 /** 472 * Helper function to calculate {@code (exp(x)-1)/x}. 473 * <p> 474 * A Taylor series expansion is used, if x is close to 0. 475 * 476 * @param x free parameter 477 * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0 478 */ 479 static double helper2(final double x) { 480 if (FastMath.abs(x)>1e-8) { 481 return FastMath.expm1(x)/x; 482 } 483 else { 484 return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.))); 485 } 486 } 487 } 488}