001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.math3.ml.clustering; 018 019import java.util.ArrayList; 020import java.util.Collection; 021import java.util.HashMap; 022import java.util.HashSet; 023import java.util.List; 024import java.util.Map; 025import java.util.Set; 026 027import org.apache.commons.math3.exception.NotPositiveException; 028import org.apache.commons.math3.exception.NullArgumentException; 029import org.apache.commons.math3.ml.distance.DistanceMeasure; 030import org.apache.commons.math3.ml.distance.EuclideanDistance; 031import org.apache.commons.math3.util.MathUtils; 032 033/** 034 * DBSCAN (density-based spatial clustering of applications with noise) algorithm. 035 * <p> 036 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. 037 * a point p is density connected to another point q, if there exists a chain of 038 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, 039 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. 040 * A point q is directly density-reachable from point p if it is in the ε-neighborhood 041 * of this point. 042 * <p> 043 * Any point that is not density-reachable from a formed cluster is treated as noise, and 044 * will thus not be present in the result. 045 * <p> 046 * The algorithm requires two parameters: 047 * <ul> 048 * <li>eps: the distance that defines the ε-neighborhood of a point 049 * <li>minPoints: the minimum number of density-connected points required to form a cluster 050 * </ul> 051 * 052 * @param <T> type of the points to cluster 053 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> 054 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> 055 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> 056 * @since 3.2 057 */ 058public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> { 059 060 /** Maximum radius of the neighborhood to be considered. */ 061 private final double eps; 062 063 /** Minimum number of points needed for a cluster. */ 064 private final int minPts; 065 066 /** Status of a point during the clustering process. */ 067 private enum PointStatus { 068 /** The point has is considered to be noise. */ 069 NOISE, 070 /** The point is already part of a cluster. */ 071 PART_OF_CLUSTER 072 } 073 074 /** 075 * Creates a new instance of a DBSCANClusterer. 076 * <p> 077 * The euclidean distance will be used as default distance measure. 078 * 079 * @param eps maximum radius of the neighborhood to be considered 080 * @param minPts minimum number of points needed for a cluster 081 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 082 */ 083 public DBSCANClusterer(final double eps, final int minPts) 084 throws NotPositiveException { 085 this(eps, minPts, new EuclideanDistance()); 086 } 087 088 /** 089 * Creates a new instance of a DBSCANClusterer. 090 * 091 * @param eps maximum radius of the neighborhood to be considered 092 * @param minPts minimum number of points needed for a cluster 093 * @param measure the distance measure to use 094 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 095 */ 096 public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) 097 throws NotPositiveException { 098 super(measure); 099 100 if (eps < 0.0d) { 101 throw new NotPositiveException(eps); 102 } 103 if (minPts < 0) { 104 throw new NotPositiveException(minPts); 105 } 106 this.eps = eps; 107 this.minPts = minPts; 108 } 109 110 /** 111 * Returns the maximum radius of the neighborhood to be considered. 112 * @return maximum radius of the neighborhood 113 */ 114 public double getEps() { 115 return eps; 116 } 117 118 /** 119 * Returns the minimum number of points needed for a cluster. 120 * @return minimum number of points needed for a cluster 121 */ 122 public int getMinPts() { 123 return minPts; 124 } 125 126 /** 127 * Performs DBSCAN cluster analysis. 128 * 129 * @param points the points to cluster 130 * @return the list of clusters 131 * @throws NullArgumentException if the data points are null 132 */ 133 @Override 134 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException { 135 136 // sanity checks 137 MathUtils.checkNotNull(points); 138 139 final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>(); 140 final Map<Clusterable, PointStatus> visited = new HashMap<Clusterable, PointStatus>(); 141 142 for (final T point : points) { 143 if (visited.get(point) != null) { 144 continue; 145 } 146 final List<T> neighbors = getNeighbors(point, points); 147 if (neighbors.size() >= minPts) { 148 // DBSCAN does not care about center points 149 final Cluster<T> cluster = new Cluster<T>(); 150 clusters.add(expandCluster(cluster, point, neighbors, points, visited)); 151 } else { 152 visited.put(point, PointStatus.NOISE); 153 } 154 } 155 156 return clusters; 157 } 158 159 /** 160 * Expands the cluster to include density-reachable items. 161 * 162 * @param cluster Cluster to expand 163 * @param point Point to add to cluster 164 * @param neighbors List of neighbors 165 * @param points the data set 166 * @param visited the set of already visited points 167 * @return the expanded cluster 168 */ 169 private Cluster<T> expandCluster(final Cluster<T> cluster, 170 final T point, 171 final List<T> neighbors, 172 final Collection<T> points, 173 final Map<Clusterable, PointStatus> visited) { 174 cluster.addPoint(point); 175 visited.put(point, PointStatus.PART_OF_CLUSTER); 176 177 List<T> seeds = new ArrayList<T>(neighbors); 178 int index = 0; 179 while (index < seeds.size()) { 180 final T current = seeds.get(index); 181 PointStatus pStatus = visited.get(current); 182 // only check non-visited points 183 if (pStatus == null) { 184 final List<T> currentNeighbors = getNeighbors(current, points); 185 if (currentNeighbors.size() >= minPts) { 186 seeds = merge(seeds, currentNeighbors); 187 } 188 } 189 190 if (pStatus != PointStatus.PART_OF_CLUSTER) { 191 visited.put(current, PointStatus.PART_OF_CLUSTER); 192 cluster.addPoint(current); 193 } 194 195 index++; 196 } 197 return cluster; 198 } 199 200 /** 201 * Returns a list of density-reachable neighbors of a {@code point}. 202 * 203 * @param point the point to look for 204 * @param points possible neighbors 205 * @return the List of neighbors 206 */ 207 private List<T> getNeighbors(final T point, final Collection<T> points) { 208 final List<T> neighbors = new ArrayList<T>(); 209 for (final T neighbor : points) { 210 if (point != neighbor && distance(neighbor, point) <= eps) { 211 neighbors.add(neighbor); 212 } 213 } 214 return neighbors; 215 } 216 217 /** 218 * Merges two lists together. 219 * 220 * @param one first list 221 * @param two second list 222 * @return merged lists 223 */ 224 private List<T> merge(final List<T> one, final List<T> two) { 225 final Set<T> oneSet = new HashSet<T>(one); 226 for (T item : two) { 227 if (!oneSet.contains(item)) { 228 one.add(item); 229 } 230 } 231 return one; 232 } 233}