ZipfDistribution.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.statistics.distribution;

  18. import org.apache.commons.rng.UniformRandomProvider;
  19. import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;

  20. /**
  21.  * Implementation of the Zipf distribution.
  22.  *
  23.  * <p>The probability mass function of \( X \) is:
  24.  *
  25.  * <p>\[ f(k; N, s) = \frac{1/k^s}{H_{N,s}} \]
  26.  *
  27.  * <p>for \( N \in \{1, 2, 3, \dots\} \) the number of elements,
  28.  * \( s \gt 0 \) the exponent characterizing the distribution,
  29.  * \( k \in \{1, 2, \dots, N\} \) the element rank, and
  30.  * \( H_{N,s} \) is the normalizing constant which corresponds to the
  31.  * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
  32.  * generalized harmonic number</a> of order N of s.
  33.  *
  34.  * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution (Wikipedia)</a>
  35.  */
  36. public final class ZipfDistribution extends AbstractDiscreteDistribution {
  37.     /** Number of elements. */
  38.     private final int numberOfElements;
  39.     /** Exponent parameter of the distribution. */
  40.     private final double exponent;
  41.     /** Cached value of the nth generalized harmonic. */
  42.     private final double nthHarmonic;
  43.     /** Cached value of the log of the nth generalized harmonic. */
  44.     private final double logNthHarmonic;

  45.     /**
  46.      * @param numberOfElements Number of elements.
  47.      * @param exponent Exponent.
  48.      */
  49.     private ZipfDistribution(int numberOfElements,
  50.                              double exponent) {
  51.         this.numberOfElements = numberOfElements;
  52.         this.exponent = exponent;
  53.         this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
  54.         logNthHarmonic = Math.log(nthHarmonic);
  55.     }

  56.     /**
  57.      * Creates a Zipf distribution.
  58.      *
  59.      * @param numberOfElements Number of elements.
  60.      * @param exponent Exponent.
  61.      * @return the distribution
  62.      * @exception IllegalArgumentException if {@code numberOfElements <= 0}
  63.      * or {@code exponent <= 0}.
  64.      */
  65.     public static ZipfDistribution of(int numberOfElements,
  66.                                       double exponent) {
  67.         if (numberOfElements <= 0) {
  68.             throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
  69.                                             numberOfElements);
  70.         }
  71.         if (exponent < 0) {
  72.             throw new DistributionException(DistributionException.NEGATIVE,
  73.                                             exponent);
  74.         }
  75.         return new ZipfDistribution(numberOfElements, exponent);
  76.     }

  77.     /**
  78.      * Gets the number of elements parameter of this distribution.
  79.      *
  80.      * @return the number of elements.
  81.      */
  82.     public int getNumberOfElements() {
  83.         return numberOfElements;
  84.     }

  85.     /**
  86.      * Gets the exponent parameter of this distribution.
  87.      *
  88.      * @return the exponent.
  89.      */
  90.     public double getExponent() {
  91.         return exponent;
  92.     }

  93.     /** {@inheritDoc} */
  94.     @Override
  95.     public double probability(final int x) {
  96.         if (x <= 0 || x > numberOfElements) {
  97.             return 0;
  98.         }

  99.         return Math.pow(x, -exponent) / nthHarmonic;
  100.     }

  101.     /** {@inheritDoc} */
  102.     @Override
  103.     public double logProbability(int x) {
  104.         if (x <= 0 || x > numberOfElements) {
  105.             return Double.NEGATIVE_INFINITY;
  106.         }

  107.         return -Math.log(x) * exponent - logNthHarmonic;
  108.     }

  109.     /** {@inheritDoc} */
  110.     @Override
  111.     public double cumulativeProbability(final int x) {
  112.         if (x <= 0) {
  113.             return 0;
  114.         } else if (x >= numberOfElements) {
  115.             return 1;
  116.         }

  117.         return generalizedHarmonic(x, exponent) / nthHarmonic;
  118.     }

  119.     /** {@inheritDoc} */
  120.     @Override
  121.     public double survivalProbability(int x) {
  122.         if (x <= 0) {
  123.             return 1;
  124.         } else if (x >= numberOfElements) {
  125.             return 0;
  126.         }

  127.         // See http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
  128.         // S(x) = P(X > x) = ((x+1)^a Hn,a - (x+1)^a Hx+1,a + 1) / ((x+1)^a Hn,a)
  129.         // where a = exponent and Hx,a is the generalized harmonic for x with exponent a.
  130.         final double z = Math.pow(x + 1.0, exponent);
  131.         // Compute generalizedHarmonic(x, exponent) and generalizedHarmonic(x+1, exponent)
  132.         final double hx = generalizedHarmonic(x, exponent);
  133.         final double hx1 = hx + Math.pow(x + 1.0, -exponent);
  134.         // Compute the survival function
  135.         final double p = (z * (nthHarmonic - hx1) + 1) / (z * nthHarmonic);
  136.         // May overflow for large exponent so validate the probability.
  137.         // If this occurs revert to 1 - CDF(x), reusing the generalized harmonic for x
  138.         return p <= 1.0 ? p : 1.0 - hx / nthHarmonic;
  139.     }

  140.     /**
  141.      * {@inheritDoc}
  142.      *
  143.      * <p>For number of elements \( N \) and exponent \( s \), the mean is:
  144.      *
  145.      * <p>\[ \frac{H_{N,s-1}}{H_{N,s}} \]
  146.      *
  147.      * <p>where \( H_{N,k} \) is the
  148.      * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
  149.      * generalized harmonic number</a> of order \( N \) of \( k \).
  150.      */
  151.     @Override
  152.     public double getMean() {
  153.         final int N = getNumberOfElements();
  154.         final double s = getExponent();

  155.         final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);

  156.         return Hs1 / nthHarmonic;
  157.     }

  158.     /**
  159.      * {@inheritDoc}
  160.      *
  161.      * <p>For number of elements \( N \) and exponent \( s \), the variance is:
  162.      *
  163.      * <p>\[ \frac{H_{N,s-2}}{H_{N,s}} - \frac{H_{N,s-1}^2}{H_{N,s}^2} \]
  164.      *
  165.      * <p>where \( H_{N,k} \) is the
  166.      * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">
  167.      * generalized harmonic number</a> of order \( N \) of \( k \).
  168.      */
  169.     @Override
  170.     public double getVariance() {
  171.         final int N = getNumberOfElements();
  172.         final double s = getExponent();

  173.         final double Hs2 = generalizedHarmonicAscendingSum(N, s - 2);
  174.         final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1);
  175.         final double Hs = nthHarmonic;

  176.         return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
  177.     }

  178.     /**
  179.      * Calculates the Nth generalized harmonic number. See
  180.      * <a href="https://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
  181.      * Series</a>.
  182.      *
  183.      * <p>Assumes {@code exponent > 0} to arrange the terms to sum from small to large.
  184.      *
  185.      * @param n Term in the series to calculate (must be larger than 1)
  186.      * @param m Exponent (special case {@code m = 1} is the harmonic series).
  187.      * @return the n<sup>th</sup> generalized harmonic number.
  188.      */
  189.     private static double generalizedHarmonic(final int n, final double m) {
  190.         double value = 0;
  191.         // Sum small to large
  192.         for (int k = n; k >= 1; k--) {
  193.             value += Math.pow(k, -m);
  194.         }
  195.         return value;
  196.     }

  197.     /**
  198.      * Calculates the Nth generalized harmonic number.
  199.      *
  200.      * <p>Checks the value of the {@code exponent} to arrange the terms to sum from from small to large.
  201.      *
  202.      * @param n Term in the series to calculate (must be larger than 1)
  203.      * @param m Exponent (special case {@code m = 1} is the harmonic series).
  204.      * @return the n<sup>th</sup> generalized harmonic number.
  205.      */
  206.     private static double generalizedHarmonicAscendingSum(final int n, final double m) {
  207.         double value = 0;
  208.         // Sum small to large
  209.         // If m < 0 then sum ascending, otherwise descending
  210.         if (m < 0) {
  211.             for (int k = 1; k <= n; k++) {
  212.                 value += Math.pow(k, -m);
  213.             }
  214.         } else {
  215.             for (int k = n; k >= 1; k--) {
  216.                 value += Math.pow(k, -m);
  217.             }
  218.         }
  219.         return value;
  220.     }

  221.     /**
  222.      * {@inheritDoc}
  223.      *
  224.      * <p>The lower bound of the support is always 1.
  225.      *
  226.      * @return 1.
  227.      */
  228.     @Override
  229.     public int getSupportLowerBound() {
  230.         return 1;
  231.     }

  232.     /**
  233.      * {@inheritDoc}
  234.      *
  235.      * <p>The upper bound of the support is the number of elements.
  236.      *
  237.      * @return number of elements.
  238.      */
  239.     @Override
  240.     public int getSupportUpperBound() {
  241.         return getNumberOfElements();
  242.     }

  243.     /** {@inheritDoc} */
  244.     @Override
  245.     public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) {
  246.         // Zipf distribution sampler.
  247.         return RejectionInversionZipfSampler.of(rng, numberOfElements, exponent)::sample;
  248.     }
  249. }