/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.commons.lang3;
18  
19  import java.io.Serializable;
20  import java.util.Collections;
21  import java.util.HashMap;
22  import java.util.LinkedHashSet;
23  import java.util.Map;
24  import java.util.Set;
25  import java.util.stream.Stream;
26  
27  /**
28   * A set of characters.
29   *
30   * <p>Instances are immutable, but instances of subclasses may not be.</p>
31   *
32   * <p>#ThreadSafe#</p>
33   *
34   * @since 1.0
35   */
36  public class CharSet implements Serializable {
37  
38      /**
39       * Required for serialization support. Lang version 2.0.
40       *
41       * @see java.io.Serializable
42       */
43      private static final long serialVersionUID = 5947847346149275958L;
44  
45      /**
46       * A CharSet defining no characters.
47       *
48       * @since 2.0
49       */
50      public static final CharSet EMPTY = new CharSet((String) null);
51  
52      /**
53       * A CharSet defining ASCII alphabetic characters "a-zA-Z".
54       *
55       * @since 2.0
56       */
57      public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
58  
59      /**
60       * A CharSet defining ASCII alphabetic characters "a-z".
61       *
62       * @since 2.0
63       */
64      public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
65  
66      /**
67       * A CharSet defining ASCII alphabetic characters "A-Z".
68       *
69       * @since 2.0
70       */
71      public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
72  
73      /**
74       * A CharSet defining ASCII alphabetic characters "0-9".
75       *
76       * @since 2.0
77       */
78      public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
79  
80      /**
81       * A Map of the common cases used in the factory.
82       * <p>
83       * Subclasses can add more common patterns if desired.
84       * </p>
85       *
86       * @since 2.0
87       */
88      protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
89  
90      static {
91          COMMON.put(null, EMPTY);
92          COMMON.put(StringUtils.EMPTY, EMPTY);
93          COMMON.put("a-zA-Z", ASCII_ALPHA);
94          COMMON.put("A-Za-z", ASCII_ALPHA);
95          COMMON.put("a-z", ASCII_ALPHA_LOWER);
96          COMMON.put("A-Z", ASCII_ALPHA_UPPER);
97          COMMON.put("0-9", ASCII_NUMERIC);
98      }
99  
100     /**
101      * Creates a new CharSet using the syntax described below.
102      *
103      * <ul>
104      *  <li>{@code null} or empty string ("")
105      * - set containing no characters</li>
106      *  <li>Single character, such as "a"
107      *  - set containing just that character</li>
108      *  <li>Multi character, such as "a-e"
109      *  - set containing characters from one character to the other</li>
110      *  <li>Negated, such as "^a" or "^a-e"
111      *  - set containing all characters except those defined</li>
112      *  <li>Combinations, such as "abe-g"
113      *  - set containing all the characters from the individual sets</li>
114      * </ul>
115      *
116      * <p>The matching order is:</p>
117      * <ol>
118      *  <li>Negated multi character range, such as "^a-e"</li>
119      *  <li>Ordinary multi character range, such as "a-e"</li>
120      *  <li>Negated single character, such as "^a"</li>
121      *  <li>Ordinary single character, such as "a"</li>
122      * </ol>
123      *
124      * <p>Matching works left to right. Once a match is found the
125      * search starts again from the next character.</p>
126      *
127      * <p>If the same range is defined twice using the same syntax, only
128      * one range will be kept.
129      * Thus, "a-ca-c" creates only one range of "a-c".</p>
130      *
131      * <p>If the start and end of a range are in the wrong order,
132      * they are reversed. Thus "a-e" is the same as "e-a".
133      * As a result, "a-ee-a" would create only one range,
134      * as the "a-e" and "e-a" are the same.</p>
135      *
136      * <p>The set of characters represented is the union of the specified ranges.</p>
137      *
138      * <p>There are two ways to add a literal negation character ({@code ^}):</p>
139      * <ul>
140      *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
141      *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
142      * </ul>
143      *
144      * <p>Examples using the negation character:</p>
145      * <pre>
146      *     CharSet.getInstance("^a-c").contains('a') = false
147      *     CharSet.getInstance("^a-c").contains('d') = true
148      *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
149      *     CharSet.getInstance("^^a-c").contains('^') = false
150      *     CharSet.getInstance("^a-cd-f").contains('d') = true
151      *     CharSet.getInstance("a-c^").contains('^') = true
152      *     CharSet.getInstance("^", "a-c").contains('^') = true
153      * </pre>
154      *
155      * <p>All CharSet objects returned by this method will be immutable.</p>
156      *
157      * @param setStrs  Strings to merge into the set, may be null.
158      * @return a CharSet instance.
159      * @since 2.4
160      */
161     public static CharSet getInstance(final String... setStrs) {
162         if (setStrs == null) {
163             return EMPTY;
164         }
165         if (setStrs.length == 1) {
166             final CharSet common = COMMON.get(setStrs[0]);
167             if (common != null) {
168                 return common;
169             }
170         }
171         return new CharSet(setStrs);
172     }
173 
174     /** The set of CharRange objects. */
175     private final Set<CharRange> set = Collections.synchronizedSet(new LinkedHashSet<>());
176 
177     /**
178      * Constructs a new CharSet using the set syntax.
179      * Each string is merged in with the set.
180      *
181      * @param set  Strings to merge into the initial set.
182      * @throws NullPointerException if set is {@code null}.
183      */
184     protected CharSet(final String... set) {
185         Stream.of(set).forEach(this::add);
186     }
187 
188     /**
189      * Add a set definition string to the {@link CharSet}.
190      *
191      * @param str  set definition string
192      */
193     protected void add(final String str) {
194         if (str == null) {
195             return;
196         }
197         final int len = str.length();
198         int pos = 0;
199         while (pos < len) {
200             final int remainder = len - pos;
201             if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
202                 // negated range
203                 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
204                 pos += 4;
205             } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
206                 // range
207                 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
208                 pos += 3;
209             } else if (remainder >= 2 && str.charAt(pos) == '^') {
210                 // negated char
211                 set.add(CharRange.isNot(str.charAt(pos + 1)));
212                 pos += 2;
213             } else {
214                 // char
215                 set.add(CharRange.is(str.charAt(pos)));
216                 pos += 1;
217             }
218         }
219     }
220 
221     /**
222      * Tests whether this {@link CharSet} contain the specified character {@code ch}.
223      * <p>
224      * Examples using the negation character:
225      * </p>
226      * <pre>
227      *     CharSet.getInstance("^a-c").contains('a') = false
228      *     CharSet.getInstance("^a-c").contains('d') = true
229      *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
230      *     CharSet.getInstance("^^a-c").contains('^') = false
231      *     CharSet.getInstance("^a-cd-f").contains('d') = true
232      *     CharSet.getInstance("a-c^").contains('^') = true
233      *     CharSet.getInstance("^", "a-c").contains('^') = true
234      * </pre>
235      *
236      * @param ch the character to check.
237      * @return {@code true} if the set contains the characters.
238      */
239     public boolean contains(final char ch) {
240         synchronized (set) {
241             return set.stream().anyMatch(range -> range.contains(ch));
242         }
243     }
244 
245     /**
246      * Compares two {@link CharSet} objects, returning true if they represent
247      * exactly the same set of characters defined in the same way.
248      *
249      * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
250      * equal according to this method.</p>
251      *
252      * @param obj  the object to compare.
253      * @return true if equal.
254      * @since 2.0
255      */
256     @Override
257     public boolean equals(final Object obj) {
258         if (obj == this) {
259             return true;
260         }
261         if (!(obj instanceof CharSet)) {
262             return false;
263         }
264         final CharSet other = (CharSet) obj;
265         return set.equals(other.set);
266     }
267 
268     /**
269      * Gets the set of character ranges.
270      * <p>
271      * Package private for testing.
272      * </p>
273      *
274      * @return the set of character ranges.
275      */
276     Set<CharRange> getCharRanges() {
277         return set;
278     }
279 
280     /**
281      * Gets a hash code compatible with the equals method.
282      *
283      * @return a suitable hash code.
284      * @since 2.0
285      */
286     @Override
287     public int hashCode() {
288         return 89 + set.hashCode();
289     }
290 
291     /**
292      * Gets a string representation of the set.
293      *
294      * @return string representation of the set.
295      */
296     @Override
297     public String toString() {
298         return set.toString();
299     }
300 
301 }