001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.ml.clustering;
018
019import java.util.ArrayList;
020import java.util.Collection;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.apache.commons.math3.exception.NotPositiveException;
028import org.apache.commons.math3.exception.NullArgumentException;
029import org.apache.commons.math3.ml.distance.DistanceMeasure;
030import org.apache.commons.math3.ml.distance.EuclideanDistance;
031import org.apache.commons.math3.util.MathUtils;
032
033/**
034 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
035 * <p>
036 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
037 * a point p is density connected to another point q, if there exists a chain of
038 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
039 * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
040 * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
041 * of this point.
042 * <p>
043 * Any point that is not density-reachable from a formed cluster is treated as noise, and
044 * will thus not be present in the result.
045 * <p>
046 * The algorithm requires two parameters:
047 * <ul>
048 *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
049 *   <li>minPoints: the minimum number of density-connected points required to form a cluster
050 * </ul>
051 *
052 * @param <T> type of the points to cluster
053 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
054 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
055 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
056 * @version $Id: DBSCANClusterer.html 885258 2013-11-03 02:46:49Z tn $
057 * @since 3.2
058 */
059public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
060
061    /** Maximum radius of the neighborhood to be considered. */
062    private final double              eps;
063
064    /** Minimum number of points needed for a cluster. */
065    private final int                 minPts;
066
067    /** Status of a point during the clustering process. */
068    private enum PointStatus {
069        /** The point has is considered to be noise. */
070        NOISE,
071        /** The point is already part of a cluster. */
072        PART_OF_CLUSTER
073    }
074
075    /**
076     * Creates a new instance of a DBSCANClusterer.
077     * <p>
078     * The euclidean distance will be used as default distance measure.
079     *
080     * @param eps maximum radius of the neighborhood to be considered
081     * @param minPts minimum number of points needed for a cluster
082     * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
083     */
084    public DBSCANClusterer(final double eps, final int minPts)
085        throws NotPositiveException {
086        this(eps, minPts, new EuclideanDistance());
087    }
088
089    /**
090     * Creates a new instance of a DBSCANClusterer.
091     *
092     * @param eps maximum radius of the neighborhood to be considered
093     * @param minPts minimum number of points needed for a cluster
094     * @param measure the distance measure to use
095     * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
096     */
097    public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure)
098        throws NotPositiveException {
099        super(measure);
100
101        if (eps < 0.0d) {
102            throw new NotPositiveException(eps);
103        }
104        if (minPts < 0) {
105            throw new NotPositiveException(minPts);
106        }
107        this.eps = eps;
108        this.minPts = minPts;
109    }
110
111    /**
112     * Returns the maximum radius of the neighborhood to be considered.
113     * @return maximum radius of the neighborhood
114     */
115    public double getEps() {
116        return eps;
117    }
118
119    /**
120     * Returns the minimum number of points needed for a cluster.
121     * @return minimum number of points needed for a cluster
122     */
123    public int getMinPts() {
124        return minPts;
125    }
126
127    /**
128     * Performs DBSCAN cluster analysis.
129     *
130     * @param points the points to cluster
131     * @return the list of clusters
132     * @throws NullArgumentException if the data points are null
133     */
134    @Override
135    public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
136
137        // sanity checks
138        MathUtils.checkNotNull(points);
139
140        final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
141        final Map<Clusterable, PointStatus> visited = new HashMap<Clusterable, PointStatus>();
142
143        for (final T point : points) {
144            if (visited.get(point) != null) {
145                continue;
146            }
147            final List<T> neighbors = getNeighbors(point, points);
148            if (neighbors.size() >= minPts) {
149                // DBSCAN does not care about center points
150                final Cluster<T> cluster = new Cluster<T>();
151                clusters.add(expandCluster(cluster, point, neighbors, points, visited));
152            } else {
153                visited.put(point, PointStatus.NOISE);
154            }
155        }
156
157        return clusters;
158    }
159
160    /**
161     * Expands the cluster to include density-reachable items.
162     *
163     * @param cluster Cluster to expand
164     * @param point Point to add to cluster
165     * @param neighbors List of neighbors
166     * @param points the data set
167     * @param visited the set of already visited points
168     * @return the expanded cluster
169     */
170    private Cluster<T> expandCluster(final Cluster<T> cluster,
171                                     final T point,
172                                     final List<T> neighbors,
173                                     final Collection<T> points,
174                                     final Map<Clusterable, PointStatus> visited) {
175        cluster.addPoint(point);
176        visited.put(point, PointStatus.PART_OF_CLUSTER);
177
178        List<T> seeds = new ArrayList<T>(neighbors);
179        int index = 0;
180        while (index < seeds.size()) {
181            final T current = seeds.get(index);
182            PointStatus pStatus = visited.get(current);
183            // only check non-visited points
184            if (pStatus == null) {
185                final List<T> currentNeighbors = getNeighbors(current, points);
186                if (currentNeighbors.size() >= minPts) {
187                    seeds = merge(seeds, currentNeighbors);
188                }
189            }
190
191            if (pStatus != PointStatus.PART_OF_CLUSTER) {
192                visited.put(current, PointStatus.PART_OF_CLUSTER);
193                cluster.addPoint(current);
194            }
195
196            index++;
197        }
198        return cluster;
199    }
200
201    /**
202     * Returns a list of density-reachable neighbors of a {@code point}.
203     *
204     * @param point the point to look for
205     * @param points possible neighbors
206     * @return the List of neighbors
207     */
208    private List<T> getNeighbors(final T point, final Collection<T> points) {
209        final List<T> neighbors = new ArrayList<T>();
210        for (final T neighbor : points) {
211            if (point != neighbor && distance(neighbor, point) <= eps) {
212                neighbors.add(neighbor);
213            }
214        }
215        return neighbors;
216    }
217
218    /**
219     * Merges two lists together.
220     *
221     * @param one first list
222     * @param two second list
223     * @return merged lists
224     */
225    private List<T> merge(final List<T> one, final List<T> two) {
226        final Set<T> oneSet = new HashSet<T>(one);
227        for (T item : two) {
228            if (!oneSet.contains(item)) {
229                one.add(item);
230            }
231        }
232        return one;
233    }
234}