001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math4.distribution.fitting;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.List;
022
023import org.apache.commons.math4.distribution.MixtureMultivariateNormalDistribution;
024import org.apache.commons.math4.distribution.MultivariateNormalDistribution;
025import org.apache.commons.math4.exception.ConvergenceException;
026import org.apache.commons.math4.exception.DimensionMismatchException;
027import org.apache.commons.math4.exception.NotStrictlyPositiveException;
028import org.apache.commons.math4.exception.NumberIsTooLargeException;
029import org.apache.commons.math4.exception.NumberIsTooSmallException;
030import org.apache.commons.math4.exception.util.LocalizedFormats;
031import org.apache.commons.math4.linear.Array2DRowRealMatrix;
032import org.apache.commons.math4.linear.RealMatrix;
033import org.apache.commons.math4.linear.SingularMatrixException;
034import org.apache.commons.math4.stat.correlation.Covariance;
035import org.apache.commons.math4.util.FastMath;
036import org.apache.commons.math4.util.MathArrays;
037import org.apache.commons.math4.util.Pair;
038
039/**
040 * Expectation-Maximization algorithm for fitting the parameters of
041 * multivariate normal mixture model distributions.
042 *
043 * This implementation is pure original code based on <a
044 * href="https://www.ee.washington.edu/techsite/papers/documents/UWEETR-2010-0002.pdf">
045 * EM Demystified: An Expectation-Maximization Tutorial</a> by Yihua Chen and Maya R. Gupta,
046 * Department of Electrical Engineering, University of Washington, Seattle, WA 98195.
047 * It was verified using external tools like <a
048 * href="http://cran.r-project.org/web/packages/mixtools/index.html">CRAN Mixtools</a>
049 * (see the JUnit test cases) but it is <strong>not</strong> based on Mixtools code at all.
050 * The discussion of the origin of this class can be seen in the comments of the <a
051 * href="https://issues.apache.org/jira/browse/MATH-817">MATH-817</a> JIRA issue.
052 * @since 3.2
053 */
054public class MultivariateNormalMixtureExpectationMaximization {
055    /**
056     * Default maximum number of iterations allowed per fitting process.
057     */
058    private static final int DEFAULT_MAX_ITERATIONS = 1000;
059    /**
060     * Default convergence threshold for fitting.
061     */
062    private static final double DEFAULT_THRESHOLD = 1E-5;
063    /**
064     * The data to fit.
065     */
066    private final double[][] data;
067    /**
068     * The model fit against the data.
069     */
070    private MixtureMultivariateNormalDistribution fittedModel;
071    /**
072     * The log likelihood of the data given the fitted model.
073     */
074    private double logLikelihood;
075
076    /**
077     * Creates an object to fit a multivariate normal mixture model to data.
078     *
079     * @param data Data to use in fitting procedure
080     * @throws NotStrictlyPositiveException if data has no rows
081     * @throws DimensionMismatchException if rows of data have different numbers
082     *             of columns
083     * @throws NumberIsTooSmallException if the number of columns in the data is
084     *             less than 2
085     */
086    public MultivariateNormalMixtureExpectationMaximization(double[][] data)
087        throws NotStrictlyPositiveException,
088               DimensionMismatchException,
089               NumberIsTooSmallException {
090        if (data.length < 1) {
091            throw new NotStrictlyPositiveException(data.length);
092        }
093
094        this.data = new double[data.length][data[0].length];
095
096        for (int i = 0; i < data.length; i++) {
097            if (data[i].length != data[0].length) {
098                // Jagged arrays not allowed
099                throw new DimensionMismatchException(data[i].length,
100                                                     data[0].length);
101            }
102            if (data[i].length < 2) {
103                throw new NumberIsTooSmallException(LocalizedFormats.NUMBER_TOO_SMALL,
104                                                    data[i].length, 2, true);
105            }
106            this.data[i] = Arrays.copyOf(data[i], data[i].length);
107        }
108    }
109
110    /**
111     * Fit a mixture model to the data supplied to the constructor.
112     *
113     * The quality of the fit depends on the concavity of the data provided to
114     * the constructor and the initial mixture provided to this function. If the
115     * data has many local optima, multiple runs of the fitting function with
116     * different initial mixtures may be required to find the optimal solution.
117     * If a SingularMatrixException is encountered, it is possible that another
118     * initialization would work.
119     *
120     * @param initialMixture Model containing initial values of weights and
121     *            multivariate normals
122     * @param maxIterations Maximum iterations allowed for fit
123     * @param threshold Convergence threshold computed as difference in
124     *             logLikelihoods between successive iterations
125     * @throws SingularMatrixException if any component's covariance matrix is
126     *             singular during fitting
127     * @throws NotStrictlyPositiveException if numComponents is less than one
128     *             or threshold is less than Double.MIN_VALUE
129     * @throws DimensionMismatchException if initialMixture mean vector and data
130     *             number of columns are not equal
131     */
132    public void fit(final MixtureMultivariateNormalDistribution initialMixture,
133                    final int maxIterations,
134                    final double threshold)
135            throws SingularMatrixException,
136                   NotStrictlyPositiveException,
137                   DimensionMismatchException {
138        if (maxIterations < 1) {
139            throw new NotStrictlyPositiveException(maxIterations);
140        }
141
142        if (threshold < Double.MIN_VALUE) {
143            throw new NotStrictlyPositiveException(threshold);
144        }
145
146        final int n = data.length;
147
148        // Number of data columns. Jagged data already rejected in constructor,
149        // so we can assume the lengths of each row are equal.
150        final int numCols = data[0].length;
151        final int k = initialMixture.getComponents().size();
152
153        final int numMeanColumns
154            = initialMixture.getComponents().get(0).getSecond().getMeans().length;
155
156        if (numMeanColumns != numCols) {
157            throw new DimensionMismatchException(numMeanColumns, numCols);
158        }
159
160        int numIterations = 0;
161        double previousLogLikelihood = 0d;
162
163        logLikelihood = Double.NEGATIVE_INFINITY;
164
165        // Initialize model to fit to initial mixture.
166        fittedModel = new MixtureMultivariateNormalDistribution(initialMixture.getComponents());
167
168        while (numIterations++ <= maxIterations &&
169               FastMath.abs(previousLogLikelihood - logLikelihood) > threshold) {
170            previousLogLikelihood = logLikelihood;
171            double sumLogLikelihood = 0d;
172
173            // Mixture components
174            final List<Pair<Double, MultivariateNormalDistribution>> components
175                = fittedModel.getComponents();
176
177            // Weight and distribution of each component
178            final double[] weights = new double[k];
179
180            final MultivariateNormalDistribution[] mvns = new MultivariateNormalDistribution[k];
181
182            for (int j = 0; j < k; j++) {
183                weights[j] = components.get(j).getFirst();
184                mvns[j] = components.get(j).getSecond();
185            }
186
187            // E-step: compute the data dependent parameters of the expectation
188            // function.
189            // The percentage of row's total density between a row and a
190            // component
191            final double[][] gamma = new double[n][k];
192
193            // Sum of gamma for each component
194            final double[] gammaSums = new double[k];
195
196            // Sum of gamma times its row for each each component
197            final double[][] gammaDataProdSums = new double[k][numCols];
198
199            for (int i = 0; i < n; i++) {
200                final double rowDensity = fittedModel.density(data[i]);
201                sumLogLikelihood += FastMath.log(rowDensity);
202
203                for (int j = 0; j < k; j++) {
204                    gamma[i][j] = weights[j] * mvns[j].density(data[i]) / rowDensity;
205                    gammaSums[j] += gamma[i][j];
206
207                    for (int col = 0; col < numCols; col++) {
208                        gammaDataProdSums[j][col] += gamma[i][j] * data[i][col];
209                    }
210                }
211            }
212
213            logLikelihood = sumLogLikelihood / n;
214
215            // M-step: compute the new parameters based on the expectation
216            // function.
217            final double[] newWeights = new double[k];
218            final double[][] newMeans = new double[k][numCols];
219
220            for (int j = 0; j < k; j++) {
221                newWeights[j] = gammaSums[j] / n;
222                for (int col = 0; col < numCols; col++) {
223                    newMeans[j][col] = gammaDataProdSums[j][col] / gammaSums[j];
224                }
225            }
226
227            // Compute new covariance matrices
228            final RealMatrix[] newCovMats = new RealMatrix[k];
229            for (int j = 0; j < k; j++) {
230                newCovMats[j] = new Array2DRowRealMatrix(numCols, numCols);
231            }
232            for (int i = 0; i < n; i++) {
233                for (int j = 0; j < k; j++) {
234                    final RealMatrix vec
235                        = new Array2DRowRealMatrix(MathArrays.ebeSubtract(data[i], newMeans[j]));
236                    final RealMatrix dataCov
237                        = vec.multiply(vec.transpose()).scalarMultiply(gamma[i][j]);
238                    newCovMats[j] = newCovMats[j].add(dataCov);
239                }
240            }
241
242            // Converting to arrays for use by fitted model
243            final double[][][] newCovMatArrays = new double[k][numCols][numCols];
244            for (int j = 0; j < k; j++) {
245                newCovMats[j] = newCovMats[j].scalarMultiply(1d / gammaSums[j]);
246                newCovMatArrays[j] = newCovMats[j].getData();
247            }
248
249            // Update current model
250            fittedModel = new MixtureMultivariateNormalDistribution(newWeights,
251                                                                    newMeans,
252                                                                    newCovMatArrays);
253        }
254
255        if (FastMath.abs(previousLogLikelihood - logLikelihood) > threshold) {
256            // Did not converge before the maximum number of iterations
257            throw new ConvergenceException();
258        }
259    }
260
261    /**
262     * Fit a mixture model to the data supplied to the constructor.
263     *
264     * The quality of the fit depends on the concavity of the data provided to
265     * the constructor and the initial mixture provided to this function. If the
266     * data has many local optima, multiple runs of the fitting function with
267     * different initial mixtures may be required to find the optimal solution.
268     * If a SingularMatrixException is encountered, it is possible that another
269     * initialization would work.
270     *
271     * @param initialMixture Model containing initial values of weights and
272     *            multivariate normals
273     * @throws SingularMatrixException if any component's covariance matrix is
274     *             singular during fitting
275     * @throws NotStrictlyPositiveException if numComponents is less than one or
276     *             threshold is less than Double.MIN_VALUE
277     */
278    public void fit(MixtureMultivariateNormalDistribution initialMixture)
279        throws SingularMatrixException,
280               NotStrictlyPositiveException {
281        fit(initialMixture, DEFAULT_MAX_ITERATIONS, DEFAULT_THRESHOLD);
282    }
283
284    /**
285     * Helper method to create a multivariate normal mixture model which can be
286     * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
287     *
288     * This method uses the data supplied to the constructor to try to determine
289     * a good mixture model at which to start the fit, but it is not guaranteed
290     * to supply a model which will find the optimal solution or even converge.
291     *
292     * @param data Data to estimate distribution
293     * @param numComponents Number of components for estimated mixture
294     * @return Multivariate normal mixture model estimated from the data
295     * @throws NumberIsTooLargeException if {@code numComponents} is greater
296     * than the number of data rows.
297     * @throws NumberIsTooSmallException if {@code numComponents < 2}.
298     * @throws NotStrictlyPositiveException if data has less than 2 rows
299     * @throws DimensionMismatchException if rows of data have different numbers
300     *             of columns
301     */
302    public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
303                                                                 final int numComponents)
304        throws NotStrictlyPositiveException,
305               DimensionMismatchException {
306        if (data.length < 2) {
307            throw new NotStrictlyPositiveException(data.length);
308        }
309        if (numComponents < 2) {
310            throw new NumberIsTooSmallException(numComponents, 2, true);
311        }
312        if (numComponents > data.length) {
313            throw new NumberIsTooLargeException(numComponents, data.length, true);
314        }
315
316        final int numRows = data.length;
317        final int numCols = data[0].length;
318
319        // sort the data
320        final DataRow[] sortedData = new DataRow[numRows];
321        for (int i = 0; i < numRows; i++) {
322            sortedData[i] = new DataRow(data[i]);
323        }
324        Arrays.sort(sortedData);
325
326        // uniform weight for each bin
327        final double weight = 1d / numComponents;
328
329        // components of mixture model to be created
330        final List<Pair<Double, MultivariateNormalDistribution>> components =
331                new ArrayList<>(numComponents);
332
333        // create a component based on data in each bin
334        for (int binIndex = 0; binIndex < numComponents; binIndex++) {
335            // minimum index (inclusive) from sorted data for this bin
336            final int minIndex = (binIndex * numRows) / numComponents;
337
338            // maximum index (exclusive) from sorted data for this bin
339            final int maxIndex = ((binIndex + 1) * numRows) / numComponents;
340
341            // number of data records that will be in this bin
342            final int numBinRows = maxIndex - minIndex;
343
344            // data for this bin
345            final double[][] binData = new double[numBinRows][numCols];
346
347            // mean of each column for the data in the this bin
348            final double[] columnMeans = new double[numCols];
349
350            // populate bin and create component
351            for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
352                for (int j = 0; j < numCols; j++) {
353                    final double val = sortedData[i].getRow()[j];
354                    columnMeans[j] += val;
355                    binData[iBin][j] = val;
356                }
357            }
358
359            MathArrays.scaleInPlace(1d / numBinRows, columnMeans);
360
361            // covariance matrix for this bin
362            final double[][] covMat
363                = new Covariance(binData).getCovarianceMatrix().getData();
364            final MultivariateNormalDistribution mvn
365                = new MultivariateNormalDistribution(columnMeans, covMat);
366
367            components.add(new Pair<>(weight, mvn));
368        }
369
370        return new MixtureMultivariateNormalDistribution(components);
371    }
372
373    /**
374     * Gets the log likelihood of the data under the fitted model.
375     *
376     * @return Log likelihood of data or zero of no data has been fit
377     */
378    public double getLogLikelihood() {
379        return logLikelihood;
380    }
381
382    /**
383     * Gets the fitted model.
384     *
385     * @return fitted model or {@code null} if no fit has been performed yet.
386     */
387    public MixtureMultivariateNormalDistribution getFittedModel() {
388        return new MixtureMultivariateNormalDistribution(fittedModel.getComponents());
389    }
390
391    /**
392     * Class used for sorting user-supplied data.
393     */
394    private static class DataRow implements Comparable<DataRow> {
395        /** One data row. */
396        private final double[] row;
397        /** Mean of the data row. */
398        private Double mean;
399
400        /**
401         * Create a data row.
402         * @param data Data to use for the row
403         */
404        DataRow(final double[] data) {
405            // Store reference.
406            row = data;
407            // Compute mean.
408            mean = 0d;
409            for (int i = 0; i < data.length; i++) {
410                mean += data[i];
411            }
412            mean /= data.length;
413        }
414
415        /**
416         * Compare two data rows.
417         * @param other The other row
418         * @return int for sorting
419         */
420        @Override
421        public int compareTo(final DataRow other) {
422            return mean.compareTo(other.mean);
423        }
424
425        /** {@inheritDoc} */
426        @Override
427        public boolean equals(Object other) {
428
429            if (this == other) {
430                return true;
431            }
432
433            if (other instanceof DataRow) {
434                return MathArrays.equals(row, ((DataRow) other).row);
435            }
436
437            return false;
438
439        }
440
441        /** {@inheritDoc} */
442        @Override
443        public int hashCode() {
444            return Arrays.hashCode(row);
445        }
446        /**
447         * Get a data row.
448         * @return data row array
449         */
450        public double[] getRow() {
451            return row;
452        }
453    }
454}
455