001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 * 
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 * 
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3;
018
019import java.io.Serializable;
020import java.util.Collections;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.Map;
024import java.util.Set;
025
026/**
027 * <p>A set of characters.</p>
028 *
029 * <p>Instances are immutable, but instances of subclasses may not be.</p>
030 *
031 * <p>#ThreadSafe#</p>
032 * @since 1.0
033 * @version $Id: CharSet.java 1436770 2013-01-22 07:09:45Z ggregory $
034 */
035public class CharSet implements Serializable {
036
037    /**
038     * Required for serialization support. Lang version 2.0. 
039     * 
040     * @see java.io.Serializable
041     */
042    private static final long serialVersionUID = 5947847346149275958L;
043
044    /** 
045     * A CharSet defining no characters. 
046     * @since 2.0
047     */
048    public static final CharSet EMPTY = new CharSet((String) null);
049
050    /** 
051     * A CharSet defining ASCII alphabetic characters "a-zA-Z".
052     * @since 2.0
053     */
054    public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
055
056    /** 
057     * A CharSet defining ASCII alphabetic characters "a-z".
058     * @since 2.0
059     */
060    public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
061
062    /** 
063     * A CharSet defining ASCII alphabetic characters "A-Z".
064     * @since 2.0
065     */
066    public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
067
068    /** 
069     * A CharSet defining ASCII alphabetic characters "0-9".
070     * @since 2.0
071     */
072    public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
073
074    /**
075     * A Map of the common cases used in the factory.
076     * Subclasses can add more common patterns if desired
077     * @since 2.0
078     */
079    protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<String, CharSet>());
080    
081    static {
082        COMMON.put(null, EMPTY);
083        COMMON.put("", EMPTY);
084        COMMON.put("a-zA-Z", ASCII_ALPHA);
085        COMMON.put("A-Za-z", ASCII_ALPHA);
086        COMMON.put("a-z", ASCII_ALPHA_LOWER);
087        COMMON.put("A-Z", ASCII_ALPHA_UPPER);
088        COMMON.put("0-9", ASCII_NUMERIC);
089    }
090
091    /** The set of CharRange objects. */
092    private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<CharRange>());
093
094    //-----------------------------------------------------------------------
095    /**
096     * <p>Factory method to create a new CharSet using a special syntax.</p>
097     *
098     * <ul>
099     *  <li>{@code null} or empty string ("")
100     * - set containing no characters</li>
101     *  <li>Single character, such as "a"
102     *  - set containing just that character</li>
103     *  <li>Multi character, such as "a-e"
104     *  - set containing characters from one character to the other</li>
105     *  <li>Negated, such as "^a" or "^a-e"
106     *  - set containing all characters except those defined</li>
107     *  <li>Combinations, such as "abe-g"
108     *  - set containing all the characters from the individual sets</li>
109     * </ul>
110     *
111     * <p>The matching order is:</p>
112     * <ol>
113     *  <li>Negated multi character range, such as "^a-e"
114     *  <li>Ordinary multi character range, such as "a-e"
115     *  <li>Negated single character, such as "^a"
116     *  <li>Ordinary single character, such as "a"
117     * </ol>
118     * <p>Matching works left to right. Once a match is found the
119     * search starts again from the next character.</p>
120     *
121     * <p>If the same range is defined twice using the same syntax, only
122     * one range will be kept.
123     * Thus, "a-ca-c" creates only one range of "a-c".</p>
124     *
125     * <p>If the start and end of a range are in the wrong order,
126     * they are reversed. Thus "a-e" is the same as "e-a".
127     * As a result, "a-ee-a" would create only one range,
128     * as the "a-e" and "e-a" are the same.</p>
129     *
130     * <p>The set of characters represented is the union of the specified ranges.</p>
131     *
132     * <p>All CharSet objects returned by this method will be immutable.</p>
133     *
134     * @param setStrs  Strings to merge into the set, may be null
135     * @return a CharSet instance
136     * @since 2.4
137     */
138    public static CharSet getInstance(final String... setStrs) {
139        if (setStrs == null) {
140            return null;
141        }
142        if (setStrs.length == 1) {
143            final CharSet common = COMMON.get(setStrs[0]);
144            if (common != null) {
145                return common;
146            }
147        }
148        return new CharSet(setStrs); 
149    }
150
151    //-----------------------------------------------------------------------
152    /**
153     * <p>Constructs a new CharSet using the set syntax.
154     * Each string is merged in with the set.</p>
155     *
156     * @param set  Strings to merge into the initial set
157     * @throws NullPointerException if set is {@code null}
158     */
159    protected CharSet(final String... set) {
160        super();
161        final int sz = set.length;
162        for (int i = 0; i < sz; i++) {
163            add(set[i]);
164        }
165    }
166
167    //-----------------------------------------------------------------------
168    /**
169     * <p>Add a set definition string to the {@code CharSet}.</p>
170     *
171     * @param str  set definition string
172     */
173    protected void add(final String str) {
174        if (str == null) {
175            return;
176        }
177
178        final int len = str.length();
179        int pos = 0;
180        while (pos < len) {
181            final int remainder = len - pos;
182            if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
183                // negated range
184                set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
185                pos += 4;
186            } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
187                // range
188                set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
189                pos += 3;
190            } else if (remainder >= 2 && str.charAt(pos) == '^') {
191                // negated char
192                set.add(CharRange.isNot(str.charAt(pos + 1)));
193                pos += 2;
194            } else {
195                // char
196                set.add(CharRange.is(str.charAt(pos)));
197                pos += 1;
198            }
199        }
200    }
201
202    //-----------------------------------------------------------------------
203    /**
204     * <p>Gets the internal set as an array of CharRange objects.</p>
205     *
206     * @return an array of immutable CharRange objects
207     * @since 2.0
208     */
209// NOTE: This is no longer public as CharRange is no longer a public class. 
210//       It may be replaced when CharSet moves to Range.
211    /*public*/ CharRange[] getCharRanges() {
212        return set.toArray(new CharRange[set.size()]);
213    }
214
215    //-----------------------------------------------------------------------
216    /**
217     * <p>Does the {@code CharSet} contain the specified
218     * character {@code ch}.</p>
219     *
220     * @param ch  the character to check for
221     * @return {@code true} if the set contains the characters
222     */
223    public boolean contains(final char ch) {
224        for (final CharRange range : set) {
225            if (range.contains(ch)) {
226                return true;
227            }
228        }
229        return false;
230    }
231
232    // Basics
233    //-----------------------------------------------------------------------
234    /**
235     * <p>Compares two {@code CharSet} objects, returning true if they represent
236     * exactly the same set of characters defined in the same way.</p>
237     *
238     * <p>The two sets {@code abc} and {@code a-c} are <i>not</i>
239     * equal according to this method.</p>
240     *
241     * @param obj  the object to compare to
242     * @return true if equal
243     * @since 2.0
244     */
245    @Override
246    public boolean equals(final Object obj) {
247        if (obj == this) {
248            return true;
249        }
250        if (obj instanceof CharSet == false) {
251            return false;
252        }
253        final CharSet other = (CharSet) obj;
254        return set.equals(other.set);
255    }
256
257    /**
258     * <p>Gets a hash code compatible with the equals method.</p>
259     *
260     * @return a suitable hash code
261     * @since 2.0
262     */
263    @Override
264    public int hashCode() {
265        return 89 + set.hashCode();
266    }
267
268    /**
269     * <p>Gets a string representation of the set.</p>
270     *
271     * @return string representation of the set
272     */
273    @Override
274    public String toString() {
275        return set.toString();
276    }
277
278}