001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math4.legacy.ml.clustering; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collection; 023import java.util.List; 024 025import org.apache.commons.rng.UniformRandomProvider; 026import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException; 027import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure; 028import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean; 029 030/** 031 * Implementation of k-means++ algorithm. 032 * It is based on 033 * <blockquote> 034 * Elkan, Charles. 035 * "Using the triangle inequality to accelerate k-means." 036 * ICML. Vol. 3. 2003. 037 * </blockquote> 038 * 039 * <p> 040 * Algorithm uses triangle inequality to speed up computation, by reducing 041 * the amount of distances calculations. Towards the last iterations of 042 * the algorithm, points which already assigned to some cluster are unlikely 043 * to move to a new cluster; updates of cluster centers are also usually 044 * relatively small. 045 * Triangle inequality is thus used to determine the cases where distance 046 * computation could be skipped since center move only a little, without 047 * affecting points partitioning. 048 * 049 * <p> 050 * For initial centers seeding, we apply the algorithm described in 051 * <blockquote> 052 * Arthur, David, and Sergei Vassilvitskii. 053 * "k-means++: The advantages of careful seeding." 054 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. 055 * Society for Industrial and Applied Mathematics, 2007. 056 * </blockquote> 057 * 058 * @param <T> Type of the points to cluster. 059 */ 060public class ElkanKMeansPlusPlusClusterer<T extends Clusterable> 061 extends KMeansPlusPlusClusterer<T> { 062 063 /** 064 * @param k Clustering parameter. 065 */ 066 public ElkanKMeansPlusPlusClusterer(int k) { 067 super(k); 068 } 069 070 /** 071 * @param k Clustering parameter. 072 * @param maxIterations Allowed number of iterations. 073 * @param measure Distance measure. 074 * @param random Random generator. 075 */ 076 public ElkanKMeansPlusPlusClusterer(int k, 077 int maxIterations, 078 DistanceMeasure measure, 079 UniformRandomProvider random) { 080 super(k, maxIterations, measure, random); 081 } 082 083 /** 084 * @param k Clustering parameter. 085 * @param maxIterations Allowed number of iterations. 086 * @param measure Distance measure. 087 * @param random Random generator. 088 * @param emptyStrategy Strategy for handling empty clusters that 089 * may appear during algorithm progress. 090 */ 091 public ElkanKMeansPlusPlusClusterer(int k, 092 int maxIterations, 093 DistanceMeasure measure, 094 UniformRandomProvider random, 095 EmptyClusterStrategy emptyStrategy) { 096 super(k, maxIterations, measure, random, emptyStrategy); 097 } 098 099 /** {@inheritDoc} */ 100 @Override 101 public List<CentroidCluster<T>> cluster(final Collection<T> points) { 102 final int k = getNumberOfClusters(); 103 104 // Number of clusters has to be smaller or equal the number of data points. 105 if (points.size() < k) { 106 throw new NumberIsTooSmallException(points.size(), k, false); 107 } 108 109 final List<T> pointsList = new ArrayList<>(points); 110 final int n = points.size(); 111 final int dim = pointsList.get(0).getPoint().length; 112 113 // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is 114 // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c) 115 final double[] s = new double[k]; 116 Arrays.fill(s, Double.MAX_VALUE); 117 // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2) 118 final double[][] dcc = new double[k][k]; 119 120 // For each point keeps the upper bound distance to the cluster center. 121 final double[] u = new double[n]; 122 Arrays.fill(u, Double.MAX_VALUE); 123 124 // For each point and for each cluster keeps the lower bound for the distance between the point and cluster 125 final double[][] l = new double[n][k]; 126 127 // Seed initial set of cluster centers. 128 final double[][] centers = seed(pointsList); 129 130 // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates 131 // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c). 132 final int[] partitions = partitionPoints(pointsList, centers, u, l); 133 134 final double[] deltas = new double[k]; 135 VectorialMean[] means = new VectorialMean[k]; 136 for (int it = 0, max = getMaxIterations(); 137 it < max; 138 it++) { 139 int changes = 0; 140 // Step I. 141 // Compute inter-cluster distances. 142 updateIntraCentersDistances(centers, dcc, s); 143 144 for (int xi = 0; xi < n; xi++) { 145 boolean r = true; 146 147 // Step II. 148 if (u[xi] <= s[partitions[xi]]) { 149 continue; 150 } 151 152 for (int c = 0; c < k; c++) { 153 // Check condition III. 154 if (isSkipNext(partitions, u, l, dcc, xi, c)) { 155 continue; 156 } 157 158 final double[] x = pointsList.get(xi).getPoint(); 159 160 // III(a) 161 if (r) { 162 u[xi] = distance(x, centers[partitions[xi]]); 163 l[xi][partitions[xi]] = u[xi]; 164 r = false; 165 } 166 // III(b) 167 if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) { 168 l[xi][c] = distance(x, centers[c]); 169 if (l[xi][c] < u[xi]) { 170 partitions[xi] = c; 171 u[xi] = l[xi][c]; 172 ++changes; 173 } 174 } 175 } 176 } 177 178 // Stopping criterion. 179 if (changes == 0 && 180 it != 0) { // First iteration needed (to update bounds). 181 break; 182 } 183 184 // Step IV. 185 Arrays.fill(means, null); 186 for (int i = 0; i < n; i++) { 187 if (means[partitions[i]] == null) { 188 means[partitions[i]] = new VectorialMean(dim); 189 } 190 means[partitions[i]].increment(pointsList.get(i).getPoint()); 191 } 192 193 for (int i = 0; i < k; i++) { 194 deltas[i] = distance(centers[i], means[i].getResult()); 195 centers[i] = means[i].getResult(); 196 } 197 198 updateBounds(partitions, u, l, deltas); 199 } 200 201 return buildResults(pointsList, partitions, centers); 202 } 203 204 /** 205 * kmeans++ seeding which provides guarantee of resulting with log(k) approximation 206 * for final clustering results 207 * <p> 208 * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding." 209 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. 210 * Society for Industrial and Applied Mathematics, 2007. 211 * 212 * @param points input data points 213 * @return an array of initial clusters centers 214 * 215 */ 216 private double[][] seed(final List<T> points) { 217 final int k = getNumberOfClusters(); 218 final UniformRandomProvider random = getRandomGenerator(); 219 220 final double[][] result = new double[k][]; 221 final int n = points.size(); 222 final int pointIndex = random.nextInt(n); 223 224 final double[] minDistances = new double[n]; 225 226 int idx = 0; 227 result[idx] = points.get(pointIndex).getPoint(); 228 229 double sumSqDist = 0; 230 231 for (int i = 0; i < n; i++) { 232 final double d = distance(result[idx], points.get(i).getPoint()); 233 minDistances[i] = d * d; 234 sumSqDist += minDistances[i]; 235 } 236 237 while (++idx < k) { 238 final double p = sumSqDist * random.nextDouble(); 239 int next = 0; 240 for (double cdf = 0; cdf < p; next++) { 241 cdf += minDistances[next]; 242 } 243 244 result[idx] = points.get(next - 1).getPoint(); 245 for (int i = 0; i < n; i++) { 246 final double d = distance(result[idx], points.get(i).getPoint()); 247 sumSqDist -= minDistances[i]; 248 minDistances[i] = Math.min(minDistances[i], d * d); 249 sumSqDist += minDistances[i]; 250 } 251 } 252 253 return result; 254 } 255 256 257 /** 258 * Once initial centers are chosen, we can actually go through data points and assign points to the 259 * cluster based on the distance between initial centers and points. 260 * 261 * @param pointsList data points list 262 * @param centers current clusters centers 263 * @param u points upper bounds 264 * @param l lower bounds for points to clusters centers 265 * 266 * @return initial assignment of points into clusters 267 */ 268 private int[] partitionPoints(List<T> pointsList, 269 double[][] centers, 270 double[] u, 271 double[][] l) { 272 final int k = getNumberOfClusters(); 273 final int n = pointsList.size(); 274 // Points assignments vector. 275 final int[] assignments = new int[n]; 276 Arrays.fill(assignments, -1); 277 // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c) 278 for (int i = 0; i < n; i++) { 279 final double[] x = pointsList.get(i).getPoint(); 280 for (int j = 0; j < k; j++) { 281 l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c) 282 if (u[i] > l[i][j]) { 283 u[i] = l[i][j]; // u(x) = min_c d(x, c) 284 assignments[i] = j; // c(x) = argmin_c d(x, c) 285 } 286 } 287 } 288 return assignments; 289 } 290 291 /** 292 * Updated distances between clusters centers and for each cluster 293 * pick the closest neighbour and keep distance to it. 294 * 295 * @param centers cluster centers 296 * @param dcc matrix of distance between clusters centers, e.g. 297 * {@code dcc[i][j] = distance(centers[i], centers[j])} 298 * @param s For a given cluster, {@code s[si]} holds distance value 299 * to the closest cluster center. 300 */ 301 private void updateIntraCentersDistances(double[][] centers, 302 double[][] dcc, 303 double[] s) { 304 final int k = getNumberOfClusters(); 305 for (int i = 0; i < k; i++) { 306 // Since distance(xi, xj) == distance(xj, xi), we need to update 307 // only upper or lower triangle of the distances matrix and mirror 308 // to the lower of upper triangle accordingly, trace has to be all 309 // zeros, since distance(xi, xi) == 0. 310 for (int j = i + 1; j < k; j++) { 311 dcc[i][j] = 0.5 * distance(centers[i], centers[j]); 312 dcc[j][i] = dcc[i][j]; 313 if (dcc[i][j] < s[i]) { 314 s[i] = dcc[i][j]; 315 } 316 if (dcc[j][i] < s[j]) { 317 s[j] = dcc[j][i]; 318 } 319 } 320 } 321 } 322 323 /** 324 * For given points and and cluster, check condition (3) of Elkan algorithm. 325 * 326 * <ul> 327 * <li>c is not the cluster xi assigned to</li> 328 * <li>{@code u[xi] > l[xi][x]} upper bound for point xi is greater than 329 * lower bound between xi and some cluster c</li> 330 * <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than 331 * distance between center of xi's cluster and c</li> 332 * </ul> 333 * 334 * @param partitions current partition of points into clusters 335 * @param u upper bounds for points 336 * @param l lower bounds for distance between cluster centers and points 337 * @param dcc matrix of distance between clusters centers 338 * @param xi index of the point 339 * @param c index of the cluster 340 * @return true if conditions above satisfied false otherwise 341 */ 342 private static boolean isSkipNext(int[] partitions, 343 double[] u, 344 double[][] l, 345 double[][] dcc, 346 int xi, 347 int c) { 348 return c == partitions[xi] || 349 u[xi] <= l[xi][c] || 350 u[xi] <= dcc[partitions[xi]][c]; 351 } 352 353 /** 354 * Once kmeans iterations have been converged and no more movements, we can build up the final 355 * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based 356 * on the converged partitioning. 357 * 358 * @param pointsList list of data points 359 * @param partitions current partition of points into clusters 360 * @param centers cluster centers 361 * @return cluster partitioning 362 */ 363 private List<CentroidCluster<T>> buildResults(List<T> pointsList, 364 int[] partitions, 365 double[][] centers) { 366 final int k = getNumberOfClusters(); 367 final List<CentroidCluster<T>> result = new ArrayList<>(); 368 for (int i = 0; i < k; i++) { 369 final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i])); 370 result.add(cluster); 371 } 372 for (int i = 0; i < pointsList.size(); i++) { 373 result.get(partitions[i]).addPoint(pointsList.get(i)); 374 } 375 return result; 376 } 377 378 /** 379 * Based on the distance that cluster center has moved we need to update our upper and lower bound. 380 * Worst case assumption, the center of the assigned to given cluster moves away from the point, while 381 * centers of over clusters become closer. 382 * 383 * @param partitions current points assiments to the clusters 384 * @param u points upper bounds 385 * @param l lower bounds for distances between point and corresponding cluster 386 * @param deltas the movement delta for each cluster center 387 */ 388 private void updateBounds(int[] partitions, 389 double[] u, 390 double[][] l, 391 double[] deltas) { 392 final int k = getNumberOfClusters(); 393 for (int i = 0; i < partitions.length; i++) { 394 u[i] += deltas[partitions[i]]; 395 for (int j = 0; j < k; j++) { 396 l[i][j] = Math.max(0, l[i][j] - deltas[j]); 397 } 398 } 399 } 400 401 /** 402 * @param a Coordinates. 403 * @param b Coordinates. 404 * @return the distance between {@code a} and {@code b}. 405 */ 406 private double distance(final double[] a, 407 final double[] b) { 408 return getDistanceMeasure().compute(a, b); 409 } 410}