1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math4.legacy.ml.clustering;
19
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collection;
23 import java.util.List;
24
25 import org.apache.commons.rng.UniformRandomProvider;
26 import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
27 import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
28 import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;
29
30 /**
31 * Implementation of k-means++ algorithm.
32 * It is based on
33 * <blockquote>
34 * Elkan, Charles.
35 * "Using the triangle inequality to accelerate k-means."
36 * ICML. Vol. 3. 2003.
37 * </blockquote>
38 *
39 * <p>
40 * Algorithm uses triangle inequality to speed up computation, by reducing
41 * the amount of distances calculations. Towards the last iterations of
42 * the algorithm, points which already assigned to some cluster are unlikely
43 * to move to a new cluster; updates of cluster centers are also usually
44 * relatively small.
45 * Triangle inequality is thus used to determine the cases where distance
46 * computation could be skipped since center move only a little, without
47 * affecting points partitioning.
48 *
49 * <p>
50 * For initial centers seeding, we apply the algorithm described in
51 * <blockquote>
52 * Arthur, David, and Sergei Vassilvitskii.
53 * "k-means++: The advantages of careful seeding."
54 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
55 * Society for Industrial and Applied Mathematics, 2007.
56 * </blockquote>
57 *
58 * @param <T> Type of the points to cluster.
59 */
60 public class ElkanKMeansPlusPlusClusterer<T extends Clusterable>
61 extends KMeansPlusPlusClusterer<T> {
62
63 /**
64 * @param k Clustering parameter.
65 */
66 public ElkanKMeansPlusPlusClusterer(int k) {
67 super(k);
68 }
69
70 /**
71 * @param k Clustering parameter.
72 * @param maxIterations Allowed number of iterations.
73 * @param measure Distance measure.
74 * @param random Random generator.
75 */
76 public ElkanKMeansPlusPlusClusterer(int k,
77 int maxIterations,
78 DistanceMeasure measure,
79 UniformRandomProvider random) {
80 super(k, maxIterations, measure, random);
81 }
82
83 /**
84 * @param k Clustering parameter.
85 * @param maxIterations Allowed number of iterations.
86 * @param measure Distance measure.
87 * @param random Random generator.
88 * @param emptyStrategy Strategy for handling empty clusters that
89 * may appear during algorithm progress.
90 */
91 public ElkanKMeansPlusPlusClusterer(int k,
92 int maxIterations,
93 DistanceMeasure measure,
94 UniformRandomProvider random,
95 EmptyClusterStrategy emptyStrategy) {
96 super(k, maxIterations, measure, random, emptyStrategy);
97 }
98
99 /** {@inheritDoc} */
100 @Override
101 public List<CentroidCluster<T>> cluster(final Collection<T> points) {
102 final int k = getNumberOfClusters();
103
104 // Number of clusters has to be smaller or equal the number of data points.
105 if (points.size() < k) {
106 throw new NumberIsTooSmallException(points.size(), k, false);
107 }
108
109 final List<T> pointsList = new ArrayList<>(points);
110 final int n = points.size();
111 final int dim = pointsList.get(0).getPoint().length;
112
113 // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is
114 // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c)
115 final double[] s = new double[k];
116 Arrays.fill(s, Double.MAX_VALUE);
117 // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2)
118 final double[][] dcc = new double[k][k];
119
120 // For each point keeps the upper bound distance to the cluster center.
121 final double[] u = new double[n];
122 Arrays.fill(u, Double.MAX_VALUE);
123
124 // For each point and for each cluster keeps the lower bound for the distance between the point and cluster
125 final double[][] l = new double[n][k];
126
127 // Seed initial set of cluster centers.
128 final double[][] centers = seed(pointsList);
129
130 // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates
131 // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c).
132 final int[] partitions = partitionPoints(pointsList, centers, u, l);
133
134 final double[] deltas = new double[k];
135 VectorialMean[] means = new VectorialMean[k];
136 for (int it = 0, max = getMaxIterations();
137 it < max;
138 it++) {
139 int changes = 0;
140 // Step I.
141 // Compute inter-cluster distances.
142 updateIntraCentersDistances(centers, dcc, s);
143
144 for (int xi = 0; xi < n; xi++) {
145 boolean r = true;
146
147 // Step II.
148 if (u[xi] <= s[partitions[xi]]) {
149 continue;
150 }
151
152 for (int c = 0; c < k; c++) {
153 // Check condition III.
154 if (isSkipNext(partitions, u, l, dcc, xi, c)) {
155 continue;
156 }
157
158 final double[] x = pointsList.get(xi).getPoint();
159
160 // III(a)
161 if (r) {
162 u[xi] = distance(x, centers[partitions[xi]]);
163 l[xi][partitions[xi]] = u[xi];
164 r = false;
165 }
166 // III(b)
167 if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) {
168 l[xi][c] = distance(x, centers[c]);
169 if (l[xi][c] < u[xi]) {
170 partitions[xi] = c;
171 u[xi] = l[xi][c];
172 ++changes;
173 }
174 }
175 }
176 }
177
178 // Stopping criterion.
179 if (changes == 0 &&
180 it != 0) { // First iteration needed (to update bounds).
181 break;
182 }
183
184 // Step IV.
185 Arrays.fill(means, null);
186 for (int i = 0; i < n; i++) {
187 if (means[partitions[i]] == null) {
188 means[partitions[i]] = new VectorialMean(dim);
189 }
190 means[partitions[i]].increment(pointsList.get(i).getPoint());
191 }
192
193 for (int i = 0; i < k; i++) {
194 deltas[i] = distance(centers[i], means[i].getResult());
195 centers[i] = means[i].getResult();
196 }
197
198 updateBounds(partitions, u, l, deltas);
199 }
200
201 return buildResults(pointsList, partitions, centers);
202 }
203
204 /**
205 * kmeans++ seeding which provides guarantee of resulting with log(k) approximation
206 * for final clustering results
207 * <p>
208 * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding."
209 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
210 * Society for Industrial and Applied Mathematics, 2007.
211 *
212 * @param points input data points
213 * @return an array of initial clusters centers
214 *
215 */
216 private double[][] seed(final List<T> points) {
217 final int k = getNumberOfClusters();
218 final UniformRandomProvider random = getRandomGenerator();
219
220 final double[][] result = new double[k][];
221 final int n = points.size();
222 final int pointIndex = random.nextInt(n);
223
224 final double[] minDistances = new double[n];
225
226 int idx = 0;
227 result[idx] = points.get(pointIndex).getPoint();
228
229 double sumSqDist = 0;
230
231 for (int i = 0; i < n; i++) {
232 final double d = distance(result[idx], points.get(i).getPoint());
233 minDistances[i] = d * d;
234 sumSqDist += minDistances[i];
235 }
236
237 while (++idx < k) {
238 final double p = sumSqDist * random.nextDouble();
239 int next = 0;
240 for (double cdf = 0; cdf < p; next++) {
241 cdf += minDistances[next];
242 }
243
244 result[idx] = points.get(next - 1).getPoint();
245 for (int i = 0; i < n; i++) {
246 final double d = distance(result[idx], points.get(i).getPoint());
247 sumSqDist -= minDistances[i];
248 minDistances[i] = Math.min(minDistances[i], d * d);
249 sumSqDist += minDistances[i];
250 }
251 }
252
253 return result;
254 }
255
256
257 /**
258 * Once initial centers are chosen, we can actually go through data points and assign points to the
259 * cluster based on the distance between initial centers and points.
260 *
261 * @param pointsList data points list
262 * @param centers current clusters centers
263 * @param u points upper bounds
264 * @param l lower bounds for points to clusters centers
265 *
266 * @return initial assignment of points into clusters
267 */
268 private int[] partitionPoints(List<T> pointsList,
269 double[][] centers,
270 double[] u,
271 double[][] l) {
272 final int k = getNumberOfClusters();
273 final int n = pointsList.size();
274 // Points assignments vector.
275 final int[] assignments = new int[n];
276 Arrays.fill(assignments, -1);
277 // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c)
278 for (int i = 0; i < n; i++) {
279 final double[] x = pointsList.get(i).getPoint();
280 for (int j = 0; j < k; j++) {
281 l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c)
282 if (u[i] > l[i][j]) {
283 u[i] = l[i][j]; // u(x) = min_c d(x, c)
284 assignments[i] = j; // c(x) = argmin_c d(x, c)
285 }
286 }
287 }
288 return assignments;
289 }
290
291 /**
292 * Updated distances between clusters centers and for each cluster
293 * pick the closest neighbour and keep distance to it.
294 *
295 * @param centers cluster centers
296 * @param dcc matrix of distance between clusters centers, e.g.
297 * {@code dcc[i][j] = distance(centers[i], centers[j])}
298 * @param s For a given cluster, {@code s[si]} holds distance value
299 * to the closest cluster center.
300 */
301 private void updateIntraCentersDistances(double[][] centers,
302 double[][] dcc,
303 double[] s) {
304 final int k = getNumberOfClusters();
305 for (int i = 0; i < k; i++) {
306 // Since distance(xi, xj) == distance(xj, xi), we need to update
307 // only upper or lower triangle of the distances matrix and mirror
308 // to the lower of upper triangle accordingly, trace has to be all
309 // zeros, since distance(xi, xi) == 0.
310 for (int j = i + 1; j < k; j++) {
311 dcc[i][j] = 0.5 * distance(centers[i], centers[j]);
312 dcc[j][i] = dcc[i][j];
313 if (dcc[i][j] < s[i]) {
314 s[i] = dcc[i][j];
315 }
316 if (dcc[j][i] < s[j]) {
317 s[j] = dcc[j][i];
318 }
319 }
320 }
321 }
322
323 /**
324 * For given points and and cluster, check condition (3) of Elkan algorithm.
325 *
326 * <ul>
327 * <li>c is not the cluster xi assigned to</li>
328 * <li>{@code u[xi] > l[xi][x]} upper bound for point xi is greater than
329 * lower bound between xi and some cluster c</li>
330 * <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than
331 * distance between center of xi's cluster and c</li>
332 * </ul>
333 *
334 * @param partitions current partition of points into clusters
335 * @param u upper bounds for points
336 * @param l lower bounds for distance between cluster centers and points
337 * @param dcc matrix of distance between clusters centers
338 * @param xi index of the point
339 * @param c index of the cluster
340 * @return true if conditions above satisfied false otherwise
341 */
342 private static boolean isSkipNext(int[] partitions,
343 double[] u,
344 double[][] l,
345 double[][] dcc,
346 int xi,
347 int c) {
348 return c == partitions[xi] ||
349 u[xi] <= l[xi][c] ||
350 u[xi] <= dcc[partitions[xi]][c];
351 }
352
353 /**
354 * Once kmeans iterations have been converged and no more movements, we can build up the final
355 * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based
356 * on the converged partitioning.
357 *
358 * @param pointsList list of data points
359 * @param partitions current partition of points into clusters
360 * @param centers cluster centers
361 * @return cluster partitioning
362 */
363 private List<CentroidCluster<T>> buildResults(List<T> pointsList,
364 int[] partitions,
365 double[][] centers) {
366 final int k = getNumberOfClusters();
367 final List<CentroidCluster<T>> result = new ArrayList<>();
368 for (int i = 0; i < k; i++) {
369 final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i]));
370 result.add(cluster);
371 }
372 for (int i = 0; i < pointsList.size(); i++) {
373 result.get(partitions[i]).addPoint(pointsList.get(i));
374 }
375 return result;
376 }
377
378 /**
379 * Based on the distance that cluster center has moved we need to update our upper and lower bound.
380 * Worst case assumption, the center of the assigned to given cluster moves away from the point, while
381 * centers of over clusters become closer.
382 *
383 * @param partitions current points assiments to the clusters
384 * @param u points upper bounds
385 * @param l lower bounds for distances between point and corresponding cluster
386 * @param deltas the movement delta for each cluster center
387 */
388 private void updateBounds(int[] partitions,
389 double[] u,
390 double[][] l,
391 double[] deltas) {
392 final int k = getNumberOfClusters();
393 for (int i = 0; i < partitions.length; i++) {
394 u[i] += deltas[partitions[i]];
395 for (int j = 0; j < k; j++) {
396 l[i][j] = Math.max(0, l[i][j] - deltas[j]);
397 }
398 }
399 }
400
401 /**
402 * @param a Coordinates.
403 * @param b Coordinates.
404 * @return the distance between {@code a} and {@code b}.
405 */
406 private double distance(final double[] a,
407 final double[] b) {
408 return getDistanceMeasure().compute(a, b);
409 }
410 }