001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.rng.sampling.distribution; 019 020import org.apache.commons.rng.UniformRandomProvider; 021 022/** 023 * Implementation of the <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution</a>. 024 * 025 * <p>Sampling uses {@link UniformRandomProvider#nextDouble()}.</p> 026 * 027 * @since 1.0 028 */ 029public class RejectionInversionZipfSampler 030 extends SamplerBase 031 implements SharedStateDiscreteSampler { 032 /** Threshold below which Taylor series will be used. */ 033 private static final double TAYLOR_THRESHOLD = 1e-8; 034 /** 1/2. */ 035 private static final double F_1_2 = 0.5; 036 /** 1/3. */ 037 private static final double F_1_3 = 1d / 3; 038 /** 1/4. */ 039 private static final double F_1_4 = 0.25; 040 /** Number of elements. */ 041 private final int numberOfElements; 042 /** Exponent parameter of the distribution. */ 043 private final double exponent; 044 /** {@code hIntegral(1.5) - 1}. */ 045 private final double hIntegralX1; 046 /** {@code hIntegral(numberOfElements + 0.5)}. */ 047 private final double hIntegralNumberOfElements; 048 /** {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */ 049 private final double s; 050 /** Underlying source of randomness. */ 051 private final UniformRandomProvider rng; 052 053 /** 054 * @param rng Generator of uniformly distributed random numbers. 055 * @param numberOfElements Number of elements. 056 * @param exponent Exponent. 057 * @throws IllegalArgumentException if {@code numberOfElements <= 0} 058 * or {@code exponent <= 0}. 059 */ 060 public RejectionInversionZipfSampler(UniformRandomProvider rng, 061 int numberOfElements, 062 double exponent) { 063 super(null); 064 this.rng = rng; 065 if (numberOfElements <= 0) { 066 throw new IllegalArgumentException("number of elements is not strictly positive: " + numberOfElements); 067 } 068 if (exponent <= 0) { 069 throw new IllegalArgumentException("exponent is not strictly positive: " + exponent); 070 } 071 072 this.numberOfElements = numberOfElements; 073 this.exponent = exponent; 074 this.hIntegralX1 = hIntegral(1.5) - 1; 075 this.hIntegralNumberOfElements = hIntegral(numberOfElements + F_1_2); 076 this.s = 2 - hIntegralInverse(hIntegral(2.5) - h(2)); 077 } 078 079 /** 080 * @param rng Generator of uniformly distributed random numbers. 081 * @param source Source to copy. 082 */ 083 private RejectionInversionZipfSampler(UniformRandomProvider rng, 084 RejectionInversionZipfSampler source) { 085 super(null); 086 this.rng = rng; 087 this.numberOfElements = source.numberOfElements; 088 this.exponent = source.exponent; 089 this.hIntegralX1 = source.hIntegralX1; 090 this.hIntegralNumberOfElements = source.hIntegralNumberOfElements; 091 this.s = source.s; 092 } 093 094 /** 095 * Rejection inversion sampling method for a discrete, bounded Zipf 096 * distribution that is based on the method described in 097 * <blockquote> 098 * Wolfgang Hörmann and Gerhard Derflinger. 099 * <i>"Rejection-inversion to generate variates from monotone discrete 100 * distributions",</i><br> 101 * <strong>ACM Transactions on Modeling and Computer Simulation</strong> (TOMACS) 6.3 (1996): 169-184. 102 * </blockquote> 103 */ 104 @Override 105 public int sample() { 106 // The paper describes an algorithm for exponents larger than 1 107 // (Algorithm ZRI). 108 // The original method uses 109 // H(x) = (v + x)^(1 - q) / (1 - q) 110 // as the integral of the hat function. 111 // This function is undefined for q = 1, which is the reason for 112 // the limitation of the exponent. 113 // If instead the integral function 114 // H(x) = ((v + x)^(1 - q) - 1) / (1 - q) 115 // is used, 116 // for which a meaningful limit exists for q = 1, the method works 117 // for all positive exponents. 118 // The following implementation uses v = 0 and generates integral 119 // number in the range [1, numberOfElements]. 120 // This is different to the original method where v is defined to 121 // be positive and numbers are taken from [0, i_max]. 122 // This explains why the implementation looks slightly different. 123 124 while (true) { 125 final double u = hIntegralNumberOfElements + rng.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements); 126 // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements] 127 128 final double x = hIntegralInverse(u); 129 int k = (int) (x + F_1_2); 130 131 // Limit k to the range [1, numberOfElements] if it would be outside 132 // due to numerical inaccuracies. 133 if (k < 1) { 134 k = 1; 135 } else if (k > numberOfElements) { 136 k = numberOfElements; 137 } 138 139 // Here, the distribution of k is given by: 140 // 141 // P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C 142 // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 143 // 144 // where C = 1 / (hIntegralNumberOfElements - hIntegralX1) 145 146 if (k - x <= s || u >= hIntegral(k + F_1_2) - h(k)) { 147 148 // Case k = 1: 149 // 150 // The right inequality is always true, because replacing k by 1 gives 151 // u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from 152 // (hIntegralX1, hIntegralNumberOfElements]. 153 // 154 // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 155 // and the probability that 1 is returned as random value is 156 // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent 157 // 158 // Case k >= 2: 159 // 160 // The left inequality (k - x <= s) is just a short cut 161 // to avoid the more expensive evaluation of the right inequality 162 // (u >= hIntegral(k + 0.5) - h(k)) in many cases. 163 // 164 // If the left inequality is true, the right inequality is also true: 165 // Theorem 2 in the paper is valid for all positive exponents, because 166 // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and 167 // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 168 // are both fulfilled. 169 // Therefore, f(x) = x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) 170 // is a non-decreasing function. If k - x <= s holds, 171 // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to 172 // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), 173 // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), 174 // and finally u >= hIntegral(k + 0.5) - h(k). 175 // 176 // Hence, the right inequality determines the acceptance rate: 177 // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) 178 // The probability that m is returned is given by 179 // P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent. 180 // 181 // In both cases the probabilities are proportional to the probability mass function 182 // of the Zipf distribution. 183 184 return k; 185 } 186 } 187 } 188 189 /** {@inheritDoc} */ 190 @Override 191 public String toString() { 192 return "Rejection inversion Zipf deviate [" + rng.toString() + "]"; 193 } 194 195 /** 196 * {@inheritDoc} 197 * 198 * @since 1.3 199 */ 200 @Override 201 public SharedStateDiscreteSampler withUniformRandomProvider(UniformRandomProvider rng) { 202 return new RejectionInversionZipfSampler(rng, this); 203 } 204 205 /** 206 * Creates a new Zipf distribution sampler. 207 * 208 * @param rng Generator of uniformly distributed random numbers. 209 * @param numberOfElements Number of elements. 210 * @param exponent Exponent. 211 * @return the sampler 212 * @throws IllegalArgumentException if {@code numberOfElements <= 0} or 213 * {@code exponent <= 0}. 214 * @since 1.3 215 */ 216 public static SharedStateDiscreteSampler of(UniformRandomProvider rng, 217 int numberOfElements, 218 double exponent) { 219 return new RejectionInversionZipfSampler(rng, numberOfElements, exponent); 220 } 221 222 /** 223 * {@code H(x)} is defined as 224 * <ul> 225 * <li>{@code (x^(1 - exponent) - 1) / (1 - exponent)}, if {@code exponent != 1}</li> 226 * <li>{@code log(x)}, if {@code exponent == 1}</li> 227 * </ul> 228 * H(x) is an integral function of h(x), the derivative of H(x) is h(x). 229 * 230 * @param x Free parameter. 231 * @return {@code H(x)}. 232 */ 233 private double hIntegral(final double x) { 234 final double logX = Math.log(x); 235 return helper2((1 - exponent) * logX) * logX; 236 } 237 238 /** 239 * {@code h(x) = 1 / x^exponent}. 240 * 241 * @param x Free parameter. 242 * @return {@code h(x)}. 243 */ 244 private double h(final double x) { 245 return Math.exp(-exponent * Math.log(x)); 246 } 247 248 /** 249 * The inverse function of {@code H(x)}. 250 * 251 * @param x Free parameter 252 * @return y for which {@code H(y) = x}. 253 */ 254 private double hIntegralInverse(final double x) { 255 double t = x * (1 - exponent); 256 if (t < -1) { 257 // Limit value to the range [-1, +inf). 258 // t could be smaller than -1 in some rare cases due to numerical errors. 259 t = -1; 260 } 261 return Math.exp(helper1(t) * x); 262 } 263 264 /** 265 * Helper function that calculates {@code log(1 + x) / x}. 266 * <p> 267 * A Taylor series expansion is used, if x is close to 0. 268 * </p> 269 * 270 * @param x A value larger than or equal to -1. 271 * @return {@code log(1 + x) / x}. 272 */ 273 private static double helper1(final double x) { 274 if (Math.abs(x) > TAYLOR_THRESHOLD) { 275 return Math.log1p(x) / x; 276 } else { 277 return 1 - x * (F_1_2 - x * (F_1_3 - F_1_4 * x)); 278 } 279 } 280 281 /** 282 * Helper function to calculate {@code (exp(x) - 1) / x}. 283 * <p> 284 * A Taylor series expansion is used, if x is close to 0. 285 * </p> 286 * 287 * @param x Free parameter. 288 * @return {@code (exp(x) - 1) / x} if x is non-zero, or 1 if x = 0. 289 */ 290 private static double helper2(final double x) { 291 if (Math.abs(x) > TAYLOR_THRESHOLD) { 292 return Math.expm1(x) / x; 293 } else { 294 return 1 + x * F_1_2 * (1 + x * F_1_3 * (1 + F_1_4 * x)); 295 } 296 } 297}