001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.stat.clustering;
018
019import java.util.ArrayList;
020import java.util.Collection;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.apache.commons.math3.exception.NotPositiveException;
028import org.apache.commons.math3.exception.NullArgumentException;
029import org.apache.commons.math3.util.MathUtils;
030
031/**
032 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
033 * <p>
034 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
035 * a point p is density connected to another point q, if there exists a chain of
036 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
037 * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
038 * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
039 * of this point.
040 * <p>
041 * Any point that is not density-reachable from a formed cluster is treated as noise, and
042 * will thus not be present in the result.
043 * <p>
044 * The algorithm requires two parameters:
045 * <ul>
046 *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
047 *   <li>minPoints: the minimum number of density-connected points required to form a cluster
048 * </ul>
049 * <p>
050 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
051 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
052 * return {@code null}.
053 *
054 * @param <T> type of the points to cluster
055 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
056 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
057 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
058 * @since 3.1
059 * @deprecated As of 3.2 (to be removed in 4.0),
060 * use {@link org.apache.commons.math3.ml.clustering.DBSCANClusterer} instead
061 */
062@Deprecated
063public class DBSCANClusterer<T extends Clusterable<T>> {
064
065    /** Maximum radius of the neighborhood to be considered. */
066    private final double              eps;
067
068    /** Minimum number of points needed for a cluster. */
069    private final int                 minPts;
070
071    /** Status of a point during the clustering process. */
072    private enum PointStatus {
073        /** The point has is considered to be noise. */
074        NOISE,
075        /** The point is already part of a cluster. */
076        PART_OF_CLUSTER
077    }
078
079    /**
080     * Creates a new instance of a DBSCANClusterer.
081     *
082     * @param eps maximum radius of the neighborhood to be considered
083     * @param minPts minimum number of points needed for a cluster
084     * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
085     */
086    public DBSCANClusterer(final double eps, final int minPts)
087        throws NotPositiveException {
088        if (eps < 0.0d) {
089            throw new NotPositiveException(eps);
090        }
091        if (minPts < 0) {
092            throw new NotPositiveException(minPts);
093        }
094        this.eps = eps;
095        this.minPts = minPts;
096    }
097
098    /**
099     * Returns the maximum radius of the neighborhood to be considered.
100     *
101     * @return maximum radius of the neighborhood
102     */
103    public double getEps() {
104        return eps;
105    }
106
107    /**
108     * Returns the minimum number of points needed for a cluster.
109     *
110     * @return minimum number of points needed for a cluster
111     */
112    public int getMinPts() {
113        return minPts;
114    }
115
116    /**
117     * Performs DBSCAN cluster analysis.
118     * <p>
119     * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
120     * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
121     * return {@code null}.
122     *
123     * @param points the points to cluster
124     * @return the list of clusters
125     * @throws NullArgumentException if the data points are null
126     */
127    public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
128
129        // sanity checks
130        MathUtils.checkNotNull(points);
131
132        final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
133        final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();
134
135        for (final T point : points) {
136            if (visited.get(point) != null) {
137                continue;
138            }
139            final List<T> neighbors = getNeighbors(point, points);
140            if (neighbors.size() >= minPts) {
141                // DBSCAN does not care about center points
142                final Cluster<T> cluster = new Cluster<T>(null);
143                clusters.add(expandCluster(cluster, point, neighbors, points, visited));
144            } else {
145                visited.put(point, PointStatus.NOISE);
146            }
147        }
148
149        return clusters;
150    }
151
152    /**
153     * Expands the cluster to include density-reachable items.
154     *
155     * @param cluster Cluster to expand
156     * @param point Point to add to cluster
157     * @param neighbors List of neighbors
158     * @param points the data set
159     * @param visited the set of already visited points
160     * @return the expanded cluster
161     */
162    private Cluster<T> expandCluster(final Cluster<T> cluster,
163                                     final T point,
164                                     final List<T> neighbors,
165                                     final Collection<T> points,
166                                     final Map<Clusterable<T>, PointStatus> visited) {
167        cluster.addPoint(point);
168        visited.put(point, PointStatus.PART_OF_CLUSTER);
169
170        List<T> seeds = new ArrayList<T>(neighbors);
171        int index = 0;
172        while (index < seeds.size()) {
173            final T current = seeds.get(index);
174            PointStatus pStatus = visited.get(current);
175            // only check non-visited points
176            if (pStatus == null) {
177                final List<T> currentNeighbors = getNeighbors(current, points);
178                if (currentNeighbors.size() >= minPts) {
179                    seeds = merge(seeds, currentNeighbors);
180                }
181            }
182
183            if (pStatus != PointStatus.PART_OF_CLUSTER) {
184                visited.put(current, PointStatus.PART_OF_CLUSTER);
185                cluster.addPoint(current);
186            }
187
188            index++;
189        }
190        return cluster;
191    }
192
193    /**
194     * Returns a list of density-reachable neighbors of a {@code point}.
195     *
196     * @param point the point to look for
197     * @param points possible neighbors
198     * @return the List of neighbors
199     */
200    private List<T> getNeighbors(final T point, final Collection<T> points) {
201        final List<T> neighbors = new ArrayList<T>();
202        for (final T neighbor : points) {
203            if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
204                neighbors.add(neighbor);
205            }
206        }
207        return neighbors;
208    }
209
210    /**
211     * Merges two lists together.
212     *
213     * @param one first list
214     * @param two second list
215     * @return merged lists
216     */
217    private List<T> merge(final List<T> one, final List<T> two) {
218        final Set<T> oneSet = new HashSet<T>(one);
219        for (T item : two) {
220            if (!oneSet.contains(item)) {
221                one.add(item);
222            }
223        }
224        return one;
225    }
226}