001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3;
018
019import java.io.Serializable;
020import java.util.Collections;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.Map;
024import java.util.Set;
025
026/**
027 * <p>A set of characters.</p>
028 *
029 * <p>Instances are immutable, but instances of subclasses may not be.</p>
030 *
031 * <p>#ThreadSafe#</p>
032 * @since 1.0
033 */
034public class CharSet implements Serializable {
035
036    /**
037     * Required for serialization support. Lang version 2.0.
038     *
039     * @see java.io.Serializable
040     */
041    private static final long serialVersionUID = 5947847346149275958L;
042
043    /**
044     * A CharSet defining no characters.
045     * @since 2.0
046     */
047    public static final CharSet EMPTY = new CharSet((String) null);
048
049    /**
050     * A CharSet defining ASCII alphabetic characters "a-zA-Z".
051     * @since 2.0
052     */
053    public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
054
055    /**
056     * A CharSet defining ASCII alphabetic characters "a-z".
057     * @since 2.0
058     */
059    public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
060
061    /**
062     * A CharSet defining ASCII alphabetic characters "A-Z".
063     * @since 2.0
064     */
065    public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
066
067    /**
068     * A CharSet defining ASCII alphabetic characters "0-9".
069     * @since 2.0
070     */
071    public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
072
073    /**
074     * A Map of the common cases used in the factory.
075     * Subclasses can add more common patterns if desired
076     * @since 2.0
077     */
078    protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
079
080    static {
081        COMMON.put(null, EMPTY);
082        COMMON.put(StringUtils.EMPTY, EMPTY);
083        COMMON.put("a-zA-Z", ASCII_ALPHA);
084        COMMON.put("A-Za-z", ASCII_ALPHA);
085        COMMON.put("a-z", ASCII_ALPHA_LOWER);
086        COMMON.put("A-Z", ASCII_ALPHA_UPPER);
087        COMMON.put("0-9", ASCII_NUMERIC);
088    }
089
090    /** The set of CharRange objects. */
091    private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());
092
093    //-----------------------------------------------------------------------
094    /**
095     * <p>Factory method to create a new CharSet using a special syntax.</p>
096     *
097     * <ul>
098     *  <li>{@code null} or empty string ("")
099     * - set containing no characters</li>
100     *  <li>Single character, such as "a"
101     *  - set containing just that character</li>
102     *  <li>Multi character, such as "a-e"
103     *  - set containing characters from one character to the other</li>
104     *  <li>Negated, such as "^a" or "^a-e"
105     *  - set containing all characters except those defined</li>
106     *  <li>Combinations, such as "abe-g"
107     *  - set containing all the characters from the individual sets</li>
108     * </ul>
109     *
110     * <p>The matching order is:</p>
111     * <ol>
112     *  <li>Negated multi character range, such as "^a-e"
113     *  <li>Ordinary multi character range, such as "a-e"
114     *  <li>Negated single character, such as "^a"
115     *  <li>Ordinary single character, such as "a"
116     * </ol>
117     *
118     * <p>Matching works left to right. Once a match is found the
119     * search starts again from the next character.</p>
120     *
121     * <p>If the same range is defined twice using the same syntax, only
122     * one range will be kept.
123     * Thus, "a-ca-c" creates only one range of "a-c".</p>
124     *
125     * <p>If the start and end of a range are in the wrong order,
126     * they are reversed. Thus "a-e" is the same as "e-a".
127     * As a result, "a-ee-a" would create only one range,
128     * as the "a-e" and "e-a" are the same.</p>
129     *
130     * <p>The set of characters represented is the union of the specified ranges.</p>
131     *
132     * <p>There are two ways to add a literal negation character ({@code ^}):</p>
133     * <ul>
134     *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
135     *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
136     * </ul>
137     *
138     * <p>Examples using the negation character:</p>
139     * <pre>
140     *     CharSet.getInstance("^a-c").contains('a') = false
141     *     CharSet.getInstance("^a-c").contains('d') = true
142     *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
143     *     CharSet.getInstance("^^a-c").contains('^') = false
144     *     CharSet.getInstance("^a-cd-f").contains('d') = true
145     *     CharSet.getInstance("a-c^").contains('^') = true
146     *     CharSet.getInstance("^", "a-c").contains('^') = true
147     * </pre>
148     *
149     * <p>All CharSet objects returned by this method will be immutable.</p>
150     *
151     * @param setStrs  Strings to merge into the set, may be null
152     * @return a CharSet instance
153     * @since 2.4
154     */
155    public static CharSet getInstance(final String... setStrs) {
156        if (setStrs == null) {
157            return null;
158        }
159        if (setStrs.length == 1) {
160            final CharSet common = COMMON.get(setStrs[0]);
161            if (common != null) {
162                return common;
163            }
164        }
165        return new CharSet(setStrs);
166    }
167
168    //-----------------------------------------------------------------------
169    /**
170     * <p>Constructs a new CharSet using the set syntax.
171     * Each string is merged in with the set.</p>
172     *
173     * @param set  Strings to merge into the initial set
174     * @throws NullPointerException if set is {@code null}
175     */
176    protected CharSet(final String... set) {
177        super();
178        for (final String s : set) {
179            add(s);
180        }
181    }
182
183    //-----------------------------------------------------------------------
184    /**
185     * <p>Add a set definition string to the {@code CharSet}.</p>
186     *
187     * @param str  set definition string
188     */
189    protected void add(final String str) {
190        if (str == null) {
191            return;
192        }
193
194        final int len = str.length();
195        int pos = 0;
196        while (pos < len) {
197            final int remainder = len - pos;
198            if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
199                // negated range
200                set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
201                pos += 4;
202            } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
203                // range
204                set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
205                pos += 3;
206            } else if (remainder >= 2 && str.charAt(pos) == '^') {
207                // negated char
208                set.add(CharRange.isNot(str.charAt(pos + 1)));
209                pos += 2;
210            } else {
211                // char
212                set.add(CharRange.is(str.charAt(pos)));
213                pos += 1;
214            }
215        }
216    }
217
218    //-----------------------------------------------------------------------
219    /**
220     * <p>Gets the internal set as an array of CharRange objects.</p>
221     *
222     * @return an array of immutable CharRange objects
223     * @since 2.0
224     */
225// NOTE: This is no longer public as CharRange is no longer a public class.
226//       It may be replaced when CharSet moves to Range.
227    /*public*/ CharRange[] getCharRanges() {
228        return set.toArray(new CharRange[set.size()]);
229    }
230
231    //-----------------------------------------------------------------------
232    /**
233     * <p>Does the {@code CharSet} contain the specified
234     * character {@code ch}.</p>
235     *
236     * @param ch  the character to check for
237     * @return {@code true} if the set contains the characters
238     */
239    public boolean contains(final char ch) {
240        for (final CharRange range : set) {
241            if (range.contains(ch)) {
242                return true;
243            }
244        }
245        return false;
246    }
247
248    // Basics
249    //-----------------------------------------------------------------------
250    /**
251     * <p>Compares two {@code CharSet} objects, returning true if they represent
252     * exactly the same set of characters defined in the same way.</p>
253     *
254     * <p>The two sets {@code abc} and {@code a-c} are <i>not</i>
255     * equal according to this method.</p>
256     *
257     * @param obj  the object to compare to
258     * @return true if equal
259     * @since 2.0
260     */
261    @Override
262    public boolean equals(final Object obj) {
263        if (obj == this) {
264            return true;
265        }
266        if (!(obj instanceof CharSet)) {
267            return false;
268        }
269        final CharSet other = (CharSet) obj;
270        return set.equals(other.set);
271    }
272
273    /**
274     * <p>Gets a hash code compatible with the equals method.</p>
275     *
276     * @return a suitable hash code
277     * @since 2.0
278     */
279    @Override
280    public int hashCode() {
281        return 89 + set.hashCode();
282    }
283
284    /**
285     * <p>Gets a string representation of the set.</p>
286     *
287     * @return string representation of the set
288     */
289    @Override
290    public String toString() {
291        return set.toString();
292    }
293
294}