1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.math4.legacy.ml.clustering;
18
19 import java.util.ArrayList;
20 import java.util.Collection;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.stream.Collectors;
27
28 import org.apache.commons.math4.legacy.exception.NullArgumentException;
29 import org.apache.commons.math4.legacy.exception.NotPositiveException;
30 import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
31 import org.apache.commons.math4.legacy.ml.distance.EuclideanDistance;
32
33 /**
34 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
35 * <p>
36 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
37 * a point p is density connected to another point q, if there exists a chain of
38 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
39 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable.
40 * A point q is directly density-reachable from point p if it is in the ε-neighborhood
41 * of this point.
42 * <p>
43 * Any point that is not density-reachable from a formed cluster is treated as noise, and
44 * will thus not be present in the result.
45 * <p>
46 * The algorithm requires two parameters:
47 * <ul>
48 * <li>eps: the distance that defines the ε-neighborhood of a point
49 * <li>minPoints: the minimum number of density-connected points required to form a cluster
50 * </ul>
51 *
52 * @param <T> type of the points to cluster
53 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
54 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
55 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
56 * @since 3.2
57 */
58 public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
59
60 /** Maximum radius of the neighborhood to be considered. */
61 private final double eps;
62
63 /** Minimum number of points needed for a cluster. */
64 private final int minPts;
65
66 /** Status of a point during the clustering process. */
67 private enum PointStatus {
68 /** The point has is considered to be noise. */
69 NOISE,
70 /** The point is already part of a cluster. */
71 PART_OF_CLUSTER
72 }
73
74 /**
75 * Creates a new instance of a DBSCANClusterer.
76 * <p>
77 * The euclidean distance will be used as default distance measure.
78 *
79 * @param eps maximum radius of the neighborhood to be considered
80 * @param minPts minimum number of points needed for a cluster
81 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
82 */
83 public DBSCANClusterer(final double eps, final int minPts) {
84 this(eps, minPts, new EuclideanDistance());
85 }
86
87 /**
88 * Creates a new instance of a DBSCANClusterer.
89 *
90 * @param eps maximum radius of the neighborhood to be considered
91 * @param minPts minimum number of points needed for a cluster
92 * @param measure the distance measure to use
93 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
94 */
95 public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) {
96 super(measure);
97
98 if (eps < 0.0d) {
99 throw new NotPositiveException(eps);
100 }
101 if (minPts < 0) {
102 throw new NotPositiveException(minPts);
103 }
104 this.eps = eps;
105 this.minPts = minPts;
106 }
107
108 /**
109 * Returns the maximum radius of the neighborhood to be considered.
110 * @return maximum radius of the neighborhood
111 */
112 public double getEps() {
113 return eps;
114 }
115
116 /**
117 * Returns the minimum number of points needed for a cluster.
118 * @return minimum number of points needed for a cluster
119 */
120 public int getMinPts() {
121 return minPts;
122 }
123
124 /**
125 * Performs DBSCAN cluster analysis.
126 *
127 * @param points Points to cluster (cannot be {@code null}).
128 * @return the list of clusters.
129 */
130 @Override
131 public List<Cluster<T>> cluster(final Collection<T> points) {
132 // sanity checks
133 NullArgumentException.check(points);
134
135 final List<Cluster<T>> clusters = new ArrayList<>();
136 final Map<Clusterable, PointStatus> visited = new HashMap<>();
137
138 for (final T point : points) {
139 if (visited.get(point) != null) {
140 continue;
141 }
142 final List<T> neighbors = getNeighbors(point, points);
143 if (neighbors.size() >= minPts) {
144 // DBSCAN does not care about center points
145 final Cluster<T> cluster = new Cluster<>();
146 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
147 } else {
148 visited.put(point, PointStatus.NOISE);
149 }
150 }
151
152 return clusters;
153 }
154
155 /**
156 * Expands the cluster to include density-reachable items.
157 *
158 * @param cluster Cluster to expand
159 * @param point Point to add to cluster
160 * @param neighbors List of neighbors
161 * @param points the data set
162 * @param visited the set of already visited points
163 * @return the expanded cluster
164 */
165 private Cluster<T> expandCluster(final Cluster<T> cluster,
166 final T point,
167 final List<T> neighbors,
168 final Collection<T> points,
169 final Map<Clusterable, PointStatus> visited) {
170 cluster.addPoint(point);
171 visited.put(point, PointStatus.PART_OF_CLUSTER);
172
173 List<T> seeds = new ArrayList<>(neighbors);
174 int index = 0;
175 while (index < seeds.size()) {
176 final T current = seeds.get(index);
177 PointStatus pStatus = visited.get(current);
178 // only check non-visited points
179 if (pStatus == null) {
180 final List<T> currentNeighbors = getNeighbors(current, points);
181 if (currentNeighbors.size() >= minPts) {
182 seeds = merge(seeds, currentNeighbors);
183 }
184 }
185
186 if (pStatus != PointStatus.PART_OF_CLUSTER) {
187 visited.put(current, PointStatus.PART_OF_CLUSTER);
188 cluster.addPoint(current);
189 }
190
191 index++;
192 }
193 return cluster;
194 }
195
196 /**
197 * Returns a list of density-reachable neighbors of a {@code point}.
198 *
199 * @param point the point to look for
200 * @param points possible neighbors
201 * @return the List of neighbors
202 */
203 private List<T> getNeighbors(final T point, final Collection<T> points) {
204 return points.stream().filter(neighbor -> point != neighbor && distance(neighbor, point) <= eps)
205 .collect(Collectors.toList());
206 }
207
208 /**
209 * Merges two lists together.
210 *
211 * @param one first list
212 * @param two second list
213 * @return merged lists
214 */
215 private List<T> merge(final List<T> one, final List<T> two) {
216 final Set<T> oneSet = new HashSet<>(one);
217 two.stream().filter(item -> !oneSet.contains(item)).forEach(one::add);
218 return one;
219 }
220 }