001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math4.legacy.ml.clustering;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collection;
023import java.util.List;
024
025import org.apache.commons.rng.UniformRandomProvider;
026import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
027import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
028import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;
029
030/**
031 * Implementation of k-means++ algorithm.
032 * It is based on
033 * <blockquote>
034 *  Elkan, Charles.
035 *  "Using the triangle inequality to accelerate k-means."
036 *  ICML. Vol. 3. 2003.
037 * </blockquote>
038 *
039 * <p>
040 * Algorithm uses triangle inequality to speed up computation, by reducing
041 * the amount of distances calculations.  Towards the last iterations of
042 * the algorithm, points which already assigned to some cluster are unlikely
043 * to move to a new cluster; updates of cluster centers are also usually
044 * relatively small.
045 * Triangle inequality is thus used to determine the cases where distance
046 * computation could be skipped since center move only a little, without
047 * affecting points partitioning.
048 *
049 * <p>
050 * For initial centers seeding, we apply the algorithm described in
051 * <blockquote>
052 *  Arthur, David, and Sergei Vassilvitskii.
053 *  "k-means++: The advantages of careful seeding."
054 *  Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
055 *  Society for Industrial and Applied Mathematics, 2007.
056 * </blockquote>
057 *
058 * @param <T> Type of the points to cluster.
059 */
060public class ElkanKMeansPlusPlusClusterer<T extends Clusterable>
061    extends KMeansPlusPlusClusterer<T> {
062
063    /**
064     * @param k Clustering parameter.
065     */
066    public ElkanKMeansPlusPlusClusterer(int k) {
067        super(k);
068    }
069
070    /**
071     * @param k Clustering parameter.
072     * @param maxIterations Allowed number of iterations.
073     * @param measure Distance measure.
074     * @param random Random generator.
075     */
076    public ElkanKMeansPlusPlusClusterer(int k,
077                                        int maxIterations,
078                                        DistanceMeasure measure,
079                                        UniformRandomProvider random) {
080        super(k, maxIterations, measure, random);
081    }
082
083    /**
084     * @param k Clustering parameter.
085     * @param maxIterations Allowed number of iterations.
086     * @param measure Distance measure.
087     * @param random Random generator.
088     * @param emptyStrategy Strategy for handling empty clusters that
089     * may appear during algorithm progress.
090     */
091    public ElkanKMeansPlusPlusClusterer(int k,
092                                        int maxIterations,
093                                        DistanceMeasure measure,
094                                        UniformRandomProvider random,
095                                        EmptyClusterStrategy emptyStrategy) {
096        super(k, maxIterations, measure, random, emptyStrategy);
097    }
098
099    /** {@inheritDoc} */
100    @Override
101    public List<CentroidCluster<T>> cluster(final Collection<T> points) {
102        final int k = getNumberOfClusters();
103
104        // Number of clusters has to be smaller or equal the number of data points.
105        if (points.size() < k) {
106            throw new NumberIsTooSmallException(points.size(), k, false);
107        }
108
109        final List<T> pointsList = new ArrayList<>(points);
110        final int n = points.size();
111        final int dim = pointsList.get(0).getPoint().length;
112
113        // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is
114        // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c)
115        final double[] s = new double[k];
116        Arrays.fill(s, Double.MAX_VALUE);
117        // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2)
118        final double[][] dcc = new double[k][k];
119
120        // For each point keeps the upper bound distance to the cluster center.
121        final double[] u = new double[n];
122        Arrays.fill(u, Double.MAX_VALUE);
123
124        // For each point and for each cluster keeps the lower bound for the distance between the point and cluster
125        final double[][] l = new double[n][k];
126
127        // Seed initial set of cluster centers.
128        final double[][] centers = seed(pointsList);
129
130        // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates
131        // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c).
132        final int[] partitions = partitionPoints(pointsList, centers, u, l);
133
134        final double[] deltas = new double[k];
135        VectorialMean[] means = new VectorialMean[k];
136        for (int it = 0, max = getMaxIterations();
137             it < max;
138             it++) {
139            int changes = 0;
140            // Step I.
141            // Compute inter-cluster distances.
142            updateIntraCentersDistances(centers, dcc, s);
143
144            for (int xi = 0; xi < n; xi++) {
145                boolean r = true;
146
147                // Step II.
148                if (u[xi] <= s[partitions[xi]]) {
149                    continue;
150                }
151
152                for (int c = 0; c < k; c++) {
153                    // Check condition III.
154                    if (isSkipNext(partitions, u, l, dcc, xi, c)) {
155                        continue;
156                    }
157
158                    final double[] x = pointsList.get(xi).getPoint();
159
160                    // III(a)
161                    if (r) {
162                        u[xi] = distance(x, centers[partitions[xi]]);
163                        l[xi][partitions[xi]] = u[xi];
164                        r = false;
165                    }
166                    // III(b)
167                    if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) {
168                        l[xi][c] = distance(x, centers[c]);
169                        if (l[xi][c] < u[xi]) {
170                            partitions[xi] = c;
171                            u[xi] = l[xi][c];
172                            ++changes;
173                        }
174                    }
175                }
176            }
177
178            // Stopping criterion.
179            if (changes == 0 &&
180                it != 0) { // First iteration needed (to update bounds).
181                break;
182            }
183
184            // Step IV.
185            Arrays.fill(means, null);
186            for (int i = 0; i < n; i++) {
187                if (means[partitions[i]] == null) {
188                    means[partitions[i]] = new VectorialMean(dim);
189                }
190                means[partitions[i]].increment(pointsList.get(i).getPoint());
191            }
192
193            for (int i = 0; i < k; i++) {
194                deltas[i] = distance(centers[i], means[i].getResult());
195                centers[i] = means[i].getResult();
196            }
197
198            updateBounds(partitions, u, l, deltas);
199        }
200
201        return buildResults(pointsList, partitions, centers);
202    }
203
204    /**
205     * kmeans++ seeding which provides guarantee of resulting with log(k) approximation
206     * for final clustering results
207     * <p>
208     * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding."
209     * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
210     * Society for Industrial and Applied Mathematics, 2007.
211     *
212     * @param points input data points
213     * @return an array of initial clusters centers
214     *
215     */
216    private double[][] seed(final List<T> points) {
217        final int k = getNumberOfClusters();
218        final UniformRandomProvider random = getRandomGenerator();
219
220        final double[][] result = new double[k][];
221        final int n = points.size();
222        final int pointIndex = random.nextInt(n);
223
224        final double[] minDistances = new double[n];
225
226        int idx = 0;
227        result[idx] = points.get(pointIndex).getPoint();
228
229        double sumSqDist = 0;
230
231        for (int i = 0; i < n; i++) {
232            final double d = distance(result[idx], points.get(i).getPoint());
233            minDistances[i] = d * d;
234            sumSqDist += minDistances[i];
235        }
236
237        while (++idx < k) {
238            final double p = sumSqDist * random.nextDouble();
239            int next = 0;
240            for (double cdf = 0; cdf < p; next++) {
241                cdf += minDistances[next];
242            }
243
244            result[idx] = points.get(next - 1).getPoint();
245            for (int i = 0; i < n; i++) {
246                final double d = distance(result[idx], points.get(i).getPoint());
247                sumSqDist -= minDistances[i];
248                minDistances[i] = Math.min(minDistances[i], d * d);
249                sumSqDist += minDistances[i];
250            }
251        }
252
253        return result;
254    }
255
256
257    /**
258     * Once initial centers are chosen, we can actually go through data points and assign points to the
259     * cluster based on the distance between initial centers and points.
260     *
261     * @param pointsList data points list
262     * @param centers current clusters centers
263     * @param u points upper bounds
264     * @param l lower bounds for points to clusters centers
265     *
266     * @return initial assignment of points into clusters
267     */
268    private int[] partitionPoints(List<T> pointsList,
269                                  double[][] centers,
270                                  double[] u,
271                                  double[][] l) {
272        final int k = getNumberOfClusters();
273        final int n = pointsList.size();
274        // Points assignments vector.
275        final int[] assignments = new int[n];
276        Arrays.fill(assignments, -1);
277        // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c)
278        for (int i = 0; i < n; i++) {
279            final double[] x = pointsList.get(i).getPoint();
280            for (int j = 0; j < k; j++) {
281                l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c)
282                if (u[i] > l[i][j]) {
283                    u[i] = l[i][j]; // u(x) = min_c d(x, c)
284                    assignments[i] = j; // c(x) = argmin_c d(x, c)
285                }
286            }
287        }
288        return assignments;
289    }
290
291    /**
292     * Updated distances between clusters centers and for each cluster
293     * pick the closest neighbour and keep distance to it.
294     *
295     * @param centers cluster centers
296     * @param dcc matrix of distance between clusters centers, e.g.
297     * {@code dcc[i][j] = distance(centers[i], centers[j])}
298     * @param s For a given cluster, {@code s[si]} holds distance value
299     * to the closest cluster center.
300     */
301    private void updateIntraCentersDistances(double[][] centers,
302                                             double[][] dcc,
303                                             double[] s) {
304        final int k = getNumberOfClusters();
305        for (int i = 0; i < k; i++) {
306            // Since distance(xi, xj) == distance(xj, xi), we need to update
307            // only upper or lower triangle of the distances matrix and mirror
308            // to the lower of upper triangle accordingly, trace has to be all
309            // zeros, since distance(xi, xi) == 0.
310            for (int j = i + 1; j < k; j++) {
311                dcc[i][j] = 0.5 * distance(centers[i], centers[j]);
312                dcc[j][i] = dcc[i][j];
313                if (dcc[i][j] < s[i]) {
314                    s[i] = dcc[i][j];
315                }
316                if (dcc[j][i] < s[j]) {
317                    s[j] = dcc[j][i];
318                }
319            }
320        }
321    }
322
323    /**
324     * For given points and and cluster, check condition (3) of Elkan algorithm.
325     *
326     * <ul>
327     *  <li>c is not the cluster xi assigned to</li>
328     *  <li>{@code u[xi] > l[xi][x]} upper bound for point xi is greater than
329     *   lower bound between xi and some cluster c</li>
330     *  <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than
331     *   distance between center of xi's cluster and c</li>
332     * </ul>
333     *
334     * @param partitions current partition of points into clusters
335     * @param u upper bounds for points
336     * @param l lower bounds for distance between cluster centers and points
337     * @param dcc matrix of distance between clusters centers
338     * @param xi index of the point
339     * @param c index of the cluster
340     * @return true if conditions above satisfied false otherwise
341     */
342    private static boolean isSkipNext(int[] partitions,
343                                      double[] u,
344                                      double[][] l,
345                                      double[][] dcc,
346                                      int xi,
347                                      int c) {
348        return c == partitions[xi] ||
349               u[xi] <= l[xi][c] ||
350               u[xi] <= dcc[partitions[xi]][c];
351    }
352
353    /**
354     * Once kmeans iterations have been converged and no more movements, we can build up the final
355     * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based
356     * on the converged partitioning.
357     *
358     * @param pointsList list of data points
359     * @param partitions current partition of points into clusters
360     * @param centers cluster centers
361     * @return cluster partitioning
362     */
363    private List<CentroidCluster<T>> buildResults(List<T> pointsList,
364                                                  int[] partitions,
365                                                  double[][] centers) {
366        final int k = getNumberOfClusters();
367        final List<CentroidCluster<T>> result = new ArrayList<>();
368        for (int i = 0; i < k; i++) {
369            final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i]));
370            result.add(cluster);
371        }
372        for (int i = 0; i < pointsList.size(); i++) {
373            result.get(partitions[i]).addPoint(pointsList.get(i));
374        }
375        return result;
376    }
377
378    /**
379     * Based on the distance that cluster center has moved we need to update our upper and lower bound.
380     * Worst case assumption, the center of the assigned to given cluster moves away from the point, while
381     * centers of over clusters become closer.
382     *
383     * @param partitions current points assiments to the clusters
384     * @param u points upper bounds
385     * @param l lower bounds for distances between point and corresponding cluster
386     * @param deltas the movement delta for each cluster center
387     */
388    private void updateBounds(int[] partitions,
389                              double[] u,
390                              double[][] l,
391                              double[] deltas) {
392        final int k = getNumberOfClusters();
393        for (int i = 0; i < partitions.length; i++) {
394            u[i] += deltas[partitions[i]];
395            for (int j = 0; j < k; j++) {
396                l[i][j] = Math.max(0, l[i][j] - deltas[j]);
397            }
398        }
399    }
400
401    /**
402     * @param a Coordinates.
403     * @param b Coordinates.
404     * @return the distance between {@code a} and {@code b}.
405     */
406    private double distance(final double[] a,
407                            final double[] b) {
408        return getDistanceMeasure().compute(a, b);
409    }
410}