001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.math3.distribution.fitting; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.List; 022 023import org.apache.commons.math3.distribution.MultivariateNormalDistribution; 024import org.apache.commons.math3.distribution.MixtureMultivariateNormalDistribution; 025import org.apache.commons.math3.exception.ConvergenceException; 026import org.apache.commons.math3.exception.DimensionMismatchException; 027import org.apache.commons.math3.exception.NotStrictlyPositiveException; 028import org.apache.commons.math3.exception.NumberIsTooSmallException; 029import org.apache.commons.math3.exception.NumberIsTooLargeException; 030import org.apache.commons.math3.exception.util.LocalizedFormats; 031import org.apache.commons.math3.linear.Array2DRowRealMatrix; 032import org.apache.commons.math3.linear.RealMatrix; 033import org.apache.commons.math3.linear.SingularMatrixException; 034import org.apache.commons.math3.stat.correlation.Covariance; 035import org.apache.commons.math3.util.FastMath; 036import org.apache.commons.math3.util.MathArrays; 037import org.apache.commons.math3.util.Pair; 038 039/** 040 * Expectation-Maximization</a> algorithm for fitting the parameters of 041 * multivariate normal mixture model distributions. 042 * 043 * This implementation is pure original code based on <a 044 * href="https://www.ee.washington.edu/techsite/papers/documents/UWEETR-2010-0002.pdf"> 045 * EM Demystified: An Expectation-Maximization Tutorial</a> by Yihua Chen and Maya R. Gupta, 046 * Department of Electrical Engineering, University of Washington, Seattle, WA 98195. 047 * It was verified using external tools like <a 048 * href="http://cran.r-project.org/web/packages/mixtools/index.html">CRAN Mixtools</a> 049 * (see the JUnit test cases) but it is <strong>not</strong> based on Mixtools code at all. 050 * The discussion of the origin of this class can be seen in the comments of the <a 051 * href="https://issues.apache.org/jira/browse/MATH-817">MATH-817</a> JIRA issue. 052 * @since 3.2 053 */ 054public class MultivariateNormalMixtureExpectationMaximization { 055 /** 056 * Default maximum number of iterations allowed per fitting process. 057 */ 058 private static final int DEFAULT_MAX_ITERATIONS = 1000; 059 /** 060 * Default convergence threshold for fitting. 061 */ 062 private static final double DEFAULT_THRESHOLD = 1E-5; 063 /** 064 * The data to fit. 065 */ 066 private final double[][] data; 067 /** 068 * The model fit against the data. 069 */ 070 private MixtureMultivariateNormalDistribution fittedModel; 071 /** 072 * The log likelihood of the data given the fitted model. 073 */ 074 private double logLikelihood = 0d; 075 076 /** 077 * Creates an object to fit a multivariate normal mixture model to data. 078 * 079 * @param data Data to use in fitting procedure 080 * @throws NotStrictlyPositiveException if data has no rows 081 * @throws DimensionMismatchException if rows of data have different numbers 082 * of columns 083 * @throws NumberIsTooSmallException if the number of columns in the data is 084 * less than 2 085 */ 086 public MultivariateNormalMixtureExpectationMaximization(double[][] data) 087 throws NotStrictlyPositiveException, 088 DimensionMismatchException, 089 NumberIsTooSmallException { 090 if (data.length < 1) { 091 throw new NotStrictlyPositiveException(data.length); 092 } 093 094 this.data = new double[data.length][data[0].length]; 095 096 for (int i = 0; i < data.length; i++) { 097 if (data[i].length != data[0].length) { 098 // Jagged arrays not allowed 099 throw new DimensionMismatchException(data[i].length, 100 data[0].length); 101 } 102 if (data[i].length < 2) { 103 throw new NumberIsTooSmallException(LocalizedFormats.NUMBER_TOO_SMALL, 104 data[i].length, 2, true); 105 } 106 this.data[i] = MathArrays.copyOf(data[i], data[i].length); 107 } 108 } 109 110 /** 111 * Fit a mixture model to the data supplied to the constructor. 112 * 113 * The quality of the fit depends on the concavity of the data provided to 114 * the constructor and the initial mixture provided to this function. If the 115 * data has many local optima, multiple runs of the fitting function with 116 * different initial mixtures may be required to find the optimal solution. 117 * If a SingularMatrixException is encountered, it is possible that another 118 * initialization would work. 119 * 120 * @param initialMixture Model containing initial values of weights and 121 * multivariate normals 122 * @param maxIterations Maximum iterations allowed for fit 123 * @param threshold Convergence threshold computed as difference in 124 * logLikelihoods between successive iterations 125 * @throws SingularMatrixException if any component's covariance matrix is 126 * singular during fitting 127 * @throws NotStrictlyPositiveException if numComponents is less than one 128 * or threshold is less than Double.MIN_VALUE 129 * @throws DimensionMismatchException if initialMixture mean vector and data 130 * number of columns are not equal 131 */ 132 public void fit(final MixtureMultivariateNormalDistribution initialMixture, 133 final int maxIterations, 134 final double threshold) 135 throws SingularMatrixException, 136 NotStrictlyPositiveException, 137 DimensionMismatchException { 138 if (maxIterations < 1) { 139 throw new NotStrictlyPositiveException(maxIterations); 140 } 141 142 if (threshold < Double.MIN_VALUE) { 143 throw new NotStrictlyPositiveException(threshold); 144 } 145 146 final int n = data.length; 147 148 // Number of data columns. Jagged data already rejected in constructor, 149 // so we can assume the lengths of each row are equal. 150 final int numCols = data[0].length; 151 final int k = initialMixture.getComponents().size(); 152 153 final int numMeanColumns 154 = initialMixture.getComponents().get(0).getSecond().getMeans().length; 155 156 if (numMeanColumns != numCols) { 157 throw new DimensionMismatchException(numMeanColumns, numCols); 158 } 159 160 int numIterations = 0; 161 double previousLogLikelihood = 0d; 162 163 logLikelihood = Double.NEGATIVE_INFINITY; 164 165 // Initialize model to fit to initial mixture. 166 fittedModel = new MixtureMultivariateNormalDistribution(initialMixture.getComponents()); 167 168 while (numIterations++ <= maxIterations && 169 FastMath.abs(previousLogLikelihood - logLikelihood) > threshold) { 170 previousLogLikelihood = logLikelihood; 171 double sumLogLikelihood = 0d; 172 173 // Mixture components 174 final List<Pair<Double, MultivariateNormalDistribution>> components 175 = fittedModel.getComponents(); 176 177 // Weight and distribution of each component 178 final double[] weights = new double[k]; 179 180 final MultivariateNormalDistribution[] mvns = new MultivariateNormalDistribution[k]; 181 182 for (int j = 0; j < k; j++) { 183 weights[j] = components.get(j).getFirst(); 184 mvns[j] = components.get(j).getSecond(); 185 } 186 187 // E-step: compute the data dependent parameters of the expectation 188 // function. 189 // The percentage of row's total density between a row and a 190 // component 191 final double[][] gamma = new double[n][k]; 192 193 // Sum of gamma for each component 194 final double[] gammaSums = new double[k]; 195 196 // Sum of gamma times its row for each each component 197 final double[][] gammaDataProdSums = new double[k][numCols]; 198 199 for (int i = 0; i < n; i++) { 200 final double rowDensity = fittedModel.density(data[i]); 201 sumLogLikelihood += FastMath.log(rowDensity); 202 203 for (int j = 0; j < k; j++) { 204 gamma[i][j] = weights[j] * mvns[j].density(data[i]) / rowDensity; 205 gammaSums[j] += gamma[i][j]; 206 207 for (int col = 0; col < numCols; col++) { 208 gammaDataProdSums[j][col] += gamma[i][j] * data[i][col]; 209 } 210 } 211 } 212 213 logLikelihood = sumLogLikelihood / n; 214 215 // M-step: compute the new parameters based on the expectation 216 // function. 217 final double[] newWeights = new double[k]; 218 final double[][] newMeans = new double[k][numCols]; 219 220 for (int j = 0; j < k; j++) { 221 newWeights[j] = gammaSums[j] / n; 222 for (int col = 0; col < numCols; col++) { 223 newMeans[j][col] = gammaDataProdSums[j][col] / gammaSums[j]; 224 } 225 } 226 227 // Compute new covariance matrices 228 final RealMatrix[] newCovMats = new RealMatrix[k]; 229 for (int j = 0; j < k; j++) { 230 newCovMats[j] = new Array2DRowRealMatrix(numCols, numCols); 231 } 232 for (int i = 0; i < n; i++) { 233 for (int j = 0; j < k; j++) { 234 final RealMatrix vec 235 = new Array2DRowRealMatrix(MathArrays.ebeSubtract(data[i], newMeans[j])); 236 final RealMatrix dataCov 237 = vec.multiply(vec.transpose()).scalarMultiply(gamma[i][j]); 238 newCovMats[j] = newCovMats[j].add(dataCov); 239 } 240 } 241 242 // Converting to arrays for use by fitted model 243 final double[][][] newCovMatArrays = new double[k][numCols][numCols]; 244 for (int j = 0; j < k; j++) { 245 newCovMats[j] = newCovMats[j].scalarMultiply(1d / gammaSums[j]); 246 newCovMatArrays[j] = newCovMats[j].getData(); 247 } 248 249 // Update current model 250 fittedModel = new MixtureMultivariateNormalDistribution(newWeights, 251 newMeans, 252 newCovMatArrays); 253 } 254 255 if (FastMath.abs(previousLogLikelihood - logLikelihood) > threshold) { 256 // Did not converge before the maximum number of iterations 257 throw new ConvergenceException(); 258 } 259 } 260 261 /** 262 * Fit a mixture model to the data supplied to the constructor. 263 * 264 * The quality of the fit depends on the concavity of the data provided to 265 * the constructor and the initial mixture provided to this function. If the 266 * data has many local optima, multiple runs of the fitting function with 267 * different initial mixtures may be required to find the optimal solution. 268 * If a SingularMatrixException is encountered, it is possible that another 269 * initialization would work. 270 * 271 * @param initialMixture Model containing initial values of weights and 272 * multivariate normals 273 * @throws SingularMatrixException if any component's covariance matrix is 274 * singular during fitting 275 * @throws NotStrictlyPositiveException if numComponents is less than one or 276 * threshold is less than Double.MIN_VALUE 277 */ 278 public void fit(MixtureMultivariateNormalDistribution initialMixture) 279 throws SingularMatrixException, 280 NotStrictlyPositiveException { 281 fit(initialMixture, DEFAULT_MAX_ITERATIONS, DEFAULT_THRESHOLD); 282 } 283 284 /** 285 * Helper method to create a multivariate normal mixture model which can be 286 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}. 287 * 288 * This method uses the data supplied to the constructor to try to determine 289 * a good mixture model at which to start the fit, but it is not guaranteed 290 * to supply a model which will find the optimal solution or even converge. 291 * 292 * @param data Data to estimate distribution 293 * @param numComponents Number of components for estimated mixture 294 * @return Multivariate normal mixture model estimated from the data 295 * @throws NumberIsTooLargeException if {@code numComponents} is greater 296 * than the number of data rows. 297 * @throws NumberIsTooSmallException if {@code numComponents < 2}. 298 * @throws NotStrictlyPositiveException if data has less than 2 rows 299 * @throws DimensionMismatchException if rows of data have different numbers 300 * of columns 301 */ 302 public static MixtureMultivariateNormalDistribution estimate(final double[][] data, 303 final int numComponents) 304 throws NotStrictlyPositiveException, 305 DimensionMismatchException { 306 if (data.length < 2) { 307 throw new NotStrictlyPositiveException(data.length); 308 } 309 if (numComponents < 2) { 310 throw new NumberIsTooSmallException(numComponents, 2, true); 311 } 312 if (numComponents > data.length) { 313 throw new NumberIsTooLargeException(numComponents, data.length, true); 314 } 315 316 final int numRows = data.length; 317 final int numCols = data[0].length; 318 319 // sort the data 320 final DataRow[] sortedData = new DataRow[numRows]; 321 for (int i = 0; i < numRows; i++) { 322 sortedData[i] = new DataRow(data[i]); 323 } 324 Arrays.sort(sortedData); 325 326 // uniform weight for each bin 327 final double weight = 1d / numComponents; 328 329 // components of mixture model to be created 330 final List<Pair<Double, MultivariateNormalDistribution>> components = 331 new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents); 332 333 // create a component based on data in each bin 334 for (int binIndex = 0; binIndex < numComponents; binIndex++) { 335 // minimum index (inclusive) from sorted data for this bin 336 final int minIndex = (binIndex * numRows) / numComponents; 337 338 // maximum index (exclusive) from sorted data for this bin 339 final int maxIndex = ((binIndex + 1) * numRows) / numComponents; 340 341 // number of data records that will be in this bin 342 final int numBinRows = maxIndex - minIndex; 343 344 // data for this bin 345 final double[][] binData = new double[numBinRows][numCols]; 346 347 // mean of each column for the data in the this bin 348 final double[] columnMeans = new double[numCols]; 349 350 // populate bin and create component 351 for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) { 352 for (int j = 0; j < numCols; j++) { 353 final double val = sortedData[i].getRow()[j]; 354 columnMeans[j] += val; 355 binData[iBin][j] = val; 356 } 357 } 358 359 MathArrays.scaleInPlace(1d / numBinRows, columnMeans); 360 361 // covariance matrix for this bin 362 final double[][] covMat 363 = new Covariance(binData).getCovarianceMatrix().getData(); 364 final MultivariateNormalDistribution mvn 365 = new MultivariateNormalDistribution(columnMeans, covMat); 366 367 components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn)); 368 } 369 370 return new MixtureMultivariateNormalDistribution(components); 371 } 372 373 /** 374 * Gets the log likelihood of the data under the fitted model. 375 * 376 * @return Log likelihood of data or zero of no data has been fit 377 */ 378 public double getLogLikelihood() { 379 return logLikelihood; 380 } 381 382 /** 383 * Gets the fitted model. 384 * 385 * @return fitted model or {@code null} if no fit has been performed yet. 386 */ 387 public MixtureMultivariateNormalDistribution getFittedModel() { 388 return new MixtureMultivariateNormalDistribution(fittedModel.getComponents()); 389 } 390 391 /** 392 * Class used for sorting user-supplied data. 393 */ 394 private static class DataRow implements Comparable<DataRow> { 395 /** One data row. */ 396 private final double[] row; 397 /** Mean of the data row. */ 398 private Double mean; 399 400 /** 401 * Create a data row. 402 * @param data Data to use for the row 403 */ 404 DataRow(final double[] data) { 405 // Store reference. 406 row = data; 407 // Compute mean. 408 mean = 0d; 409 for (int i = 0; i < data.length; i++) { 410 mean += data[i]; 411 } 412 mean /= data.length; 413 } 414 415 /** 416 * Compare two data rows. 417 * @param other The other row 418 * @return int for sorting 419 */ 420 public int compareTo(final DataRow other) { 421 return mean.compareTo(other.mean); 422 } 423 424 /** {@inheritDoc} */ 425 @Override 426 public boolean equals(Object other) { 427 428 if (this == other) { 429 return true; 430 } 431 432 if (other instanceof DataRow) { 433 return MathArrays.equals(row, ((DataRow) other).row); 434 } 435 436 return false; 437 438 } 439 440 /** {@inheritDoc} */ 441 @Override 442 public int hashCode() { 443 return Arrays.hashCode(row); 444 } 445 /** 446 * Get a data row. 447 * @return data row array 448 */ 449 public double[] getRow() { 450 return row; 451 } 452 } 453} 454