001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3;
018
019import java.io.Serializable;
020import java.util.Collections;
021import java.util.HashMap;
022import java.util.LinkedHashSet;
023import java.util.Map;
024import java.util.Set;
025import java.util.stream.Stream;
026
027/**
028 * A set of characters.
029 *
030 * <p>Instances are immutable, but instances of subclasses may not be.</p>
031 *
032 * <p>#ThreadSafe#</p>
033 *
034 * @since 1.0
035 */
036public class CharSet implements Serializable {
037
038    /**
039     * Required for serialization support. Lang version 2.0.
040     *
041     * @see java.io.Serializable
042     */
043    private static final long serialVersionUID = 5947847346149275958L;
044
045    /**
046     * A CharSet defining no characters.
047     *
048     * @since 2.0
049     */
050    public static final CharSet EMPTY = new CharSet((String) null);
051
052    /**
053     * A CharSet defining ASCII alphabetic characters "a-zA-Z".
054     *
055     * @since 2.0
056     */
057    public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
058
059    /**
060     * A CharSet defining ASCII alphabetic characters "a-z".
061     *
062     * @since 2.0
063     */
064    public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
065
066    /**
067     * A CharSet defining ASCII alphabetic characters "A-Z".
068     *
069     * @since 2.0
070     */
071    public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
072
073    /**
074     * A CharSet defining ASCII alphabetic characters "0-9".
075     *
076     * @since 2.0
077     */
078    public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
079
080    /**
081     * A Map of the common cases used in the factory.
082     * <p>
083     * Subclasses can add more common patterns if desired.
084     * </p>
085     *
086     * @since 2.0
087     */
088    protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
089
090    static {
091        COMMON.put(null, EMPTY);
092        COMMON.put(StringUtils.EMPTY, EMPTY);
093        COMMON.put("a-zA-Z", ASCII_ALPHA);
094        COMMON.put("A-Za-z", ASCII_ALPHA);
095        COMMON.put("a-z", ASCII_ALPHA_LOWER);
096        COMMON.put("A-Z", ASCII_ALPHA_UPPER);
097        COMMON.put("0-9", ASCII_NUMERIC);
098    }
099
100    /**
101     * Creates a new CharSet using the syntax described below.
102     *
103     * <ul>
104     *  <li>{@code null} or empty string ("")
105     * - set containing no characters</li>
106     *  <li>Single character, such as "a"
107     *  - set containing just that character</li>
108     *  <li>Multi character, such as "a-e"
109     *  - set containing characters from one character to the other</li>
110     *  <li>Negated, such as "^a" or "^a-e"
111     *  - set containing all characters except those defined</li>
112     *  <li>Combinations, such as "abe-g"
113     *  - set containing all the characters from the individual sets</li>
114     * </ul>
115     *
116     * <p>The matching order is:</p>
117     * <ol>
118     *  <li>Negated multi character range, such as "^a-e"</li>
119     *  <li>Ordinary multi character range, such as "a-e"</li>
120     *  <li>Negated single character, such as "^a"</li>
121     *  <li>Ordinary single character, such as "a"</li>
122     * </ol>
123     *
124     * <p>Matching works left to right. Once a match is found the
125     * search starts again from the next character.</p>
126     *
127     * <p>If the same range is defined twice using the same syntax, only
128     * one range will be kept.
129     * Thus, "a-ca-c" creates only one range of "a-c".</p>
130     *
131     * <p>If the start and end of a range are in the wrong order,
132     * they are reversed. Thus "a-e" is the same as "e-a".
133     * As a result, "a-ee-a" would create only one range,
134     * as the "a-e" and "e-a" are the same.</p>
135     *
136     * <p>The set of characters represented is the union of the specified ranges.</p>
137     *
138     * <p>There are two ways to add a literal negation character ({@code ^}):</p>
139     * <ul>
140     *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
141     *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
142     * </ul>
143     *
144     * <p>Examples using the negation character:</p>
145     * <pre>
146     *     CharSet.getInstance("^a-c").contains('a') = false
147     *     CharSet.getInstance("^a-c").contains('d') = true
148     *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
149     *     CharSet.getInstance("^^a-c").contains('^') = false
150     *     CharSet.getInstance("^a-cd-f").contains('d') = true
151     *     CharSet.getInstance("a-c^").contains('^') = true
152     *     CharSet.getInstance("^", "a-c").contains('^') = true
153     * </pre>
154     *
155     * <p>All CharSet objects returned by this method will be immutable.</p>
156     *
157     * @param setStrs  Strings to merge into the set, may be null.
158     * @return a CharSet instance.
159     * @since 2.4
160     */
161    public static CharSet getInstance(final String... setStrs) {
162        if (setStrs == null) {
163            return EMPTY;
164        }
165        if (setStrs.length == 1) {
166            final CharSet common = COMMON.get(setStrs[0]);
167            if (common != null) {
168                return common;
169            }
170        }
171        return new CharSet(setStrs);
172    }
173
174    /** The set of CharRange objects. */
175    private final Set<CharRange> set = Collections.synchronizedSet(new LinkedHashSet<>());
176
177    /**
178     * Constructs a new CharSet using the set syntax.
179     * Each string is merged in with the set.
180     *
181     * @param set  Strings to merge into the initial set.
182     * @throws NullPointerException if set is {@code null}.
183     */
184    protected CharSet(final String... set) {
185        Stream.of(set).forEach(this::add);
186    }
187
188    /**
189     * Add a set definition string to the {@link CharSet}.
190     *
191     * @param str  set definition string
192     */
193    protected void add(final String str) {
194        if (str == null) {
195            return;
196        }
197        final int len = str.length();
198        int pos = 0;
199        while (pos < len) {
200            final int remainder = len - pos;
201            if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
202                // negated range
203                set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
204                pos += 4;
205            } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
206                // range
207                set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
208                pos += 3;
209            } else if (remainder >= 2 && str.charAt(pos) == '^') {
210                // negated char
211                set.add(CharRange.isNot(str.charAt(pos + 1)));
212                pos += 2;
213            } else {
214                // char
215                set.add(CharRange.is(str.charAt(pos)));
216                pos += 1;
217            }
218        }
219    }
220
221    /**
222     * Tests whether this {@link CharSet} contain the specified character {@code ch}.
223     * <p>
224     * Examples using the negation character:
225     * </p>
226     * <pre>
227     *     CharSet.getInstance("^a-c").contains('a') = false
228     *     CharSet.getInstance("^a-c").contains('d') = true
229     *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
230     *     CharSet.getInstance("^^a-c").contains('^') = false
231     *     CharSet.getInstance("^a-cd-f").contains('d') = true
232     *     CharSet.getInstance("a-c^").contains('^') = true
233     *     CharSet.getInstance("^", "a-c").contains('^') = true
234     * </pre>
235     *
236     * @param ch the character to check.
237     * @return {@code true} if the set contains the characters.
238     */
239    public boolean contains(final char ch) {
240        synchronized (set) {
241            return set.stream().anyMatch(range -> range.contains(ch));
242        }
243    }
244
245    /**
246     * Compares two {@link CharSet} objects, returning true if they represent
247     * exactly the same set of characters defined in the same way.
248     *
249     * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
250     * equal according to this method.</p>
251     *
252     * @param obj  the object to compare.
253     * @return true if equal.
254     * @since 2.0
255     */
256    @Override
257    public boolean equals(final Object obj) {
258        if (obj == this) {
259            return true;
260        }
261        if (!(obj instanceof CharSet)) {
262            return false;
263        }
264        final CharSet other = (CharSet) obj;
265        return set.equals(other.set);
266    }
267
268    /**
269     * Gets the set of character ranges.
270     * <p>
271     * Package private for testing.
272     * </p>
273     *
274     * @return the set of character ranges.
275     */
276    Set<CharRange> getCharRanges() {
277        return set;
278    }
279
280    /**
281     * Gets a hash code compatible with the equals method.
282     *
283     * @return a suitable hash code.
284     * @since 2.0
285     */
286    @Override
287    public int hashCode() {
288        return 89 + set.hashCode();
289    }
290
291    /**
292     * Gets a string representation of the set.
293     *
294     * @return string representation of the set.
295     */
296    @Override
297    public String toString() {
298        return set.toString();
299    }
300
301}