001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math4.legacy.ml.clustering; 019 020import java.util.Collection; 021import java.util.List; 022 023import org.apache.commons.math4.legacy.ml.clustering.evaluation.SumOfClusterVariances; 024 025/** 026 * A wrapper around a k-means++ clustering algorithm which performs multiple trials 027 * and returns the best solution. 028 * @param <T> type of the points to cluster 029 * @since 3.2 030 */ 031public class MultiKMeansPlusPlusClusterer<T extends Clusterable> extends Clusterer<T> { 032 033 /** The underlying k-means clusterer. */ 034 private final KMeansPlusPlusClusterer<T> clusterer; 035 036 /** The number of trial runs. */ 037 private final int numTrials; 038 039 /** The cluster evaluator to use. */ 040 private final ClusterRanking evaluator; 041 042 /** Build a clusterer. 043 * @param clusterer the k-means clusterer to use 044 * @param numTrials number of trial runs 045 */ 046 public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer, 047 final int numTrials) { 048 this(clusterer, 049 numTrials, 050 ClusterEvaluator.ranking(new SumOfClusterVariances(clusterer.getDistanceMeasure()))); 051 } 052 053 /** Build a clusterer. 054 * @param clusterer the k-means clusterer to use 055 * @param numTrials number of trial runs 056 * @param evaluator the cluster evaluator to use 057 * @since 3.3 058 */ 059 public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer, 060 final int numTrials, 061 final ClusterRanking evaluator) { 062 super(clusterer.getDistanceMeasure()); 063 this.clusterer = clusterer; 064 this.numTrials = numTrials; 065 this.evaluator = evaluator; 066 } 067 068 /** 069 * Runs the K-means++ clustering algorithm. 070 * 071 * @param points the points to cluster 072 * @return a list of clusters containing the points 073 * @throws org.apache.commons.math4.legacy.exception.MathIllegalArgumentException if 074 * the data points are null or the number of clusters is larger than the 075 * number of data points 076 * @throws org.apache.commons.math4.legacy.exception.ConvergenceException if 077 * an empty cluster is encountered and the underlying {@link KMeansPlusPlusClusterer} 078 * has its {@link KMeansPlusPlusClusterer.EmptyClusterStrategy} is set to {@code ERROR}. 079 */ 080 @Override 081 public List<CentroidCluster<T>> cluster(final Collection<T> points) { 082 // at first, we have not found any clusters list yet 083 List<CentroidCluster<T>> best = null; 084 double bestRank = Double.NEGATIVE_INFINITY; 085 086 // do several clustering trials 087 for (int i = 0; i < numTrials; ++i) { 088 089 // compute a clusters list 090 List<CentroidCluster<T>> clusters = clusterer.cluster(points); 091 092 // compute the rank of the current list 093 final double rank = evaluator.compute(clusters); 094 095 if (rank > bestRank) { 096 // this one is the best we have found so far, remember it 097 best = clusters; 098 bestRank = rank; 099 } 100 } 101 102 // return the best clusters list found 103 return best; 104 } 105}