ElkanKMeansPlusPlusClusterer.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.math4.legacy.ml.clustering;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collection;
- import java.util.List;
- import org.apache.commons.rng.UniformRandomProvider;
- import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
- import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
- import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;
- /**
- * Implementation of k-means++ algorithm.
- * It is based on
- * <blockquote>
- * Elkan, Charles.
- * "Using the triangle inequality to accelerate k-means."
- * ICML. Vol. 3. 2003.
- * </blockquote>
- *
- * <p>
- * Algorithm uses triangle inequality to speed up computation, by reducing
- * the amount of distances calculations. Towards the last iterations of
- * the algorithm, points which already assigned to some cluster are unlikely
- * to move to a new cluster; updates of cluster centers are also usually
- * relatively small.
- * Triangle inequality is thus used to determine the cases where distance
- * computation could be skipped since center move only a little, without
- * affecting points partitioning.
- *
- * <p>
- * For initial centers seeding, we apply the algorithm described in
- * <blockquote>
- * Arthur, David, and Sergei Vassilvitskii.
- * "k-means++: The advantages of careful seeding."
- * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
- * Society for Industrial and Applied Mathematics, 2007.
- * </blockquote>
- *
- * @param <T> Type of the points to cluster.
- */
- public class ElkanKMeansPlusPlusClusterer<T extends Clusterable>
- extends KMeansPlusPlusClusterer<T> {
- /**
- * @param k Clustering parameter.
- */
- public ElkanKMeansPlusPlusClusterer(int k) {
- super(k);
- }
- /**
- * @param k Clustering parameter.
- * @param maxIterations Allowed number of iterations.
- * @param measure Distance measure.
- * @param random Random generator.
- */
- public ElkanKMeansPlusPlusClusterer(int k,
- int maxIterations,
- DistanceMeasure measure,
- UniformRandomProvider random) {
- super(k, maxIterations, measure, random);
- }
- /**
- * @param k Clustering parameter.
- * @param maxIterations Allowed number of iterations.
- * @param measure Distance measure.
- * @param random Random generator.
- * @param emptyStrategy Strategy for handling empty clusters that
- * may appear during algorithm progress.
- */
- public ElkanKMeansPlusPlusClusterer(int k,
- int maxIterations,
- DistanceMeasure measure,
- UniformRandomProvider random,
- EmptyClusterStrategy emptyStrategy) {
- super(k, maxIterations, measure, random, emptyStrategy);
- }
- /** {@inheritDoc} */
- @Override
- public List<CentroidCluster<T>> cluster(final Collection<T> points) {
- final int k = getNumberOfClusters();
- // Number of clusters has to be smaller or equal the number of data points.
- if (points.size() < k) {
- throw new NumberIsTooSmallException(points.size(), k, false);
- }
- final List<T> pointsList = new ArrayList<>(points);
- final int n = points.size();
- final int dim = pointsList.get(0).getPoint().length;
- // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is
- // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c)
- final double[] s = new double[k];
- Arrays.fill(s, Double.MAX_VALUE);
- // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2)
- final double[][] dcc = new double[k][k];
- // For each point keeps the upper bound distance to the cluster center.
- final double[] u = new double[n];
- Arrays.fill(u, Double.MAX_VALUE);
- // For each point and for each cluster keeps the lower bound for the distance between the point and cluster
- final double[][] l = new double[n][k];
- // Seed initial set of cluster centers.
- final double[][] centers = seed(pointsList);
- // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates
- // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c).
- final int[] partitions = partitionPoints(pointsList, centers, u, l);
- final double[] deltas = new double[k];
- VectorialMean[] means = new VectorialMean[k];
- for (int it = 0, max = getMaxIterations();
- it < max;
- it++) {
- int changes = 0;
- // Step I.
- // Compute inter-cluster distances.
- updateIntraCentersDistances(centers, dcc, s);
- for (int xi = 0; xi < n; xi++) {
- boolean r = true;
- // Step II.
- if (u[xi] <= s[partitions[xi]]) {
- continue;
- }
- for (int c = 0; c < k; c++) {
- // Check condition III.
- if (isSkipNext(partitions, u, l, dcc, xi, c)) {
- continue;
- }
- final double[] x = pointsList.get(xi).getPoint();
- // III(a)
- if (r) {
- u[xi] = distance(x, centers[partitions[xi]]);
- l[xi][partitions[xi]] = u[xi];
- r = false;
- }
- // III(b)
- if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) {
- l[xi][c] = distance(x, centers[c]);
- if (l[xi][c] < u[xi]) {
- partitions[xi] = c;
- u[xi] = l[xi][c];
- ++changes;
- }
- }
- }
- }
- // Stopping criterion.
- if (changes == 0 &&
- it != 0) { // First iteration needed (to update bounds).
- break;
- }
- // Step IV.
- Arrays.fill(means, null);
- for (int i = 0; i < n; i++) {
- if (means[partitions[i]] == null) {
- means[partitions[i]] = new VectorialMean(dim);
- }
- means[partitions[i]].increment(pointsList.get(i).getPoint());
- }
- for (int i = 0; i < k; i++) {
- deltas[i] = distance(centers[i], means[i].getResult());
- centers[i] = means[i].getResult();
- }
- updateBounds(partitions, u, l, deltas);
- }
- return buildResults(pointsList, partitions, centers);
- }
- /**
- * kmeans++ seeding which provides guarantee of resulting with log(k) approximation
- * for final clustering results
- * <p>
- * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding."
- * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
- * Society for Industrial and Applied Mathematics, 2007.
- *
- * @param points input data points
- * @return an array of initial clusters centers
- *
- */
- private double[][] seed(final List<T> points) {
- final int k = getNumberOfClusters();
- final UniformRandomProvider random = getRandomGenerator();
- final double[][] result = new double[k][];
- final int n = points.size();
- final int pointIndex = random.nextInt(n);
- final double[] minDistances = new double[n];
- int idx = 0;
- result[idx] = points.get(pointIndex).getPoint();
- double sumSqDist = 0;
- for (int i = 0; i < n; i++) {
- final double d = distance(result[idx], points.get(i).getPoint());
- minDistances[i] = d * d;
- sumSqDist += minDistances[i];
- }
- while (++idx < k) {
- final double p = sumSqDist * random.nextDouble();
- int next = 0;
- for (double cdf = 0; cdf < p; next++) {
- cdf += minDistances[next];
- }
- result[idx] = points.get(next - 1).getPoint();
- for (int i = 0; i < n; i++) {
- final double d = distance(result[idx], points.get(i).getPoint());
- sumSqDist -= minDistances[i];
- minDistances[i] = Math.min(minDistances[i], d * d);
- sumSqDist += minDistances[i];
- }
- }
- return result;
- }
- /**
- * Once initial centers are chosen, we can actually go through data points and assign points to the
- * cluster based on the distance between initial centers and points.
- *
- * @param pointsList data points list
- * @param centers current clusters centers
- * @param u points upper bounds
- * @param l lower bounds for points to clusters centers
- *
- * @return initial assignment of points into clusters
- */
- private int[] partitionPoints(List<T> pointsList,
- double[][] centers,
- double[] u,
- double[][] l) {
- final int k = getNumberOfClusters();
- final int n = pointsList.size();
- // Points assignments vector.
- final int[] assignments = new int[n];
- Arrays.fill(assignments, -1);
- // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c)
- for (int i = 0; i < n; i++) {
- final double[] x = pointsList.get(i).getPoint();
- for (int j = 0; j < k; j++) {
- l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c)
- if (u[i] > l[i][j]) {
- u[i] = l[i][j]; // u(x) = min_c d(x, c)
- assignments[i] = j; // c(x) = argmin_c d(x, c)
- }
- }
- }
- return assignments;
- }
- /**
- * Updated distances between clusters centers and for each cluster
- * pick the closest neighbour and keep distance to it.
- *
- * @param centers cluster centers
- * @param dcc matrix of distance between clusters centers, e.g.
- * {@code dcc[i][j] = distance(centers[i], centers[j])}
- * @param s For a given cluster, {@code s[si]} holds distance value
- * to the closest cluster center.
- */
- private void updateIntraCentersDistances(double[][] centers,
- double[][] dcc,
- double[] s) {
- final int k = getNumberOfClusters();
- for (int i = 0; i < k; i++) {
- // Since distance(xi, xj) == distance(xj, xi), we need to update
- // only upper or lower triangle of the distances matrix and mirror
- // to the lower of upper triangle accordingly, trace has to be all
- // zeros, since distance(xi, xi) == 0.
- for (int j = i + 1; j < k; j++) {
- dcc[i][j] = 0.5 * distance(centers[i], centers[j]);
- dcc[j][i] = dcc[i][j];
- if (dcc[i][j] < s[i]) {
- s[i] = dcc[i][j];
- }
- if (dcc[j][i] < s[j]) {
- s[j] = dcc[j][i];
- }
- }
- }
- }
- /**
- * For given points and and cluster, check condition (3) of Elkan algorithm.
- *
- * <ul>
- * <li>c is not the cluster xi assigned to</li>
- * <li>{@code u[xi] > l[xi][x]} upper bound for point xi is greater than
- * lower bound between xi and some cluster c</li>
- * <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than
- * distance between center of xi's cluster and c</li>
- * </ul>
- *
- * @param partitions current partition of points into clusters
- * @param u upper bounds for points
- * @param l lower bounds for distance between cluster centers and points
- * @param dcc matrix of distance between clusters centers
- * @param xi index of the point
- * @param c index of the cluster
- * @return true if conditions above satisfied false otherwise
- */
- private static boolean isSkipNext(int[] partitions,
- double[] u,
- double[][] l,
- double[][] dcc,
- int xi,
- int c) {
- return c == partitions[xi] ||
- u[xi] <= l[xi][c] ||
- u[xi] <= dcc[partitions[xi]][c];
- }
- /**
- * Once kmeans iterations have been converged and no more movements, we can build up the final
- * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based
- * on the converged partitioning.
- *
- * @param pointsList list of data points
- * @param partitions current partition of points into clusters
- * @param centers cluster centers
- * @return cluster partitioning
- */
- private List<CentroidCluster<T>> buildResults(List<T> pointsList,
- int[] partitions,
- double[][] centers) {
- final int k = getNumberOfClusters();
- final List<CentroidCluster<T>> result = new ArrayList<>();
- for (int i = 0; i < k; i++) {
- final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i]));
- result.add(cluster);
- }
- for (int i = 0; i < pointsList.size(); i++) {
- result.get(partitions[i]).addPoint(pointsList.get(i));
- }
- return result;
- }
- /**
- * Based on the distance that cluster center has moved we need to update our upper and lower bound.
- * Worst case assumption, the center of the assigned to given cluster moves away from the point, while
- * centers of over clusters become closer.
- *
- * @param partitions current points assiments to the clusters
- * @param u points upper bounds
- * @param l lower bounds for distances between point and corresponding cluster
- * @param deltas the movement delta for each cluster center
- */
- private void updateBounds(int[] partitions,
- double[] u,
- double[][] l,
- double[] deltas) {
- final int k = getNumberOfClusters();
- for (int i = 0; i < partitions.length; i++) {
- u[i] += deltas[partitions[i]];
- for (int j = 0; j < k; j++) {
- l[i][j] = Math.max(0, l[i][j] - deltas[j]);
- }
- }
- }
- /**
- * @param a Coordinates.
- * @param b Coordinates.
- * @return the distance between {@code a} and {@code b}.
- */
- private double distance(final double[] a,
- final double[] b) {
- return getDistanceMeasure().compute(a, b);
- }
- }