1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3;
18
19 import java.io.Serializable;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.LinkedHashSet;
23 import java.util.Map;
24 import java.util.Set;
25 import java.util.stream.Stream;
26
27 /**
28 * A set of characters.
29 *
30 * <p>Instances are immutable, but instances of subclasses may not be.</p>
31 *
32 * <p>#ThreadSafe#</p>
33 *
34 * @since 1.0
35 */
36 public class CharSet implements Serializable {
37
38 /**
39 * Required for serialization support. Lang version 2.0.
40 *
41 * @see java.io.Serializable
42 */
43 private static final long serialVersionUID = 5947847346149275958L;
44
45 /**
46 * A CharSet defining no characters.
47 *
48 * @since 2.0
49 */
50 public static final CharSet EMPTY = new CharSet((String) null);
51
52 /**
53 * A CharSet defining ASCII alphabetic characters "a-zA-Z".
54 *
55 * @since 2.0
56 */
57 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
58
59 /**
60 * A CharSet defining ASCII alphabetic characters "a-z".
61 *
62 * @since 2.0
63 */
64 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
65
66 /**
67 * A CharSet defining ASCII alphabetic characters "A-Z".
68 *
69 * @since 2.0
70 */
71 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
72
73 /**
74 * A CharSet defining ASCII alphabetic characters "0-9".
75 *
76 * @since 2.0
77 */
78 public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
79
80 /**
81 * A Map of the common cases used in the factory.
82 * <p>
83 * Subclasses can add more common patterns if desired.
84 * </p>
85 *
86 * @since 2.0
87 */
88 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
89
90 static {
91 COMMON.put(null, EMPTY);
92 COMMON.put(StringUtils.EMPTY, EMPTY);
93 COMMON.put("a-zA-Z", ASCII_ALPHA);
94 COMMON.put("A-Za-z", ASCII_ALPHA);
95 COMMON.put("a-z", ASCII_ALPHA_LOWER);
96 COMMON.put("A-Z", ASCII_ALPHA_UPPER);
97 COMMON.put("0-9", ASCII_NUMERIC);
98 }
99
100 /**
101 * Creates a new CharSet using the syntax described below.
102 *
103 * <ul>
104 * <li>{@code null} or empty string ("")
105 * - set containing no characters</li>
106 * <li>Single character, such as "a"
107 * - set containing just that character</li>
108 * <li>Multi character, such as "a-e"
109 * - set containing characters from one character to the other</li>
110 * <li>Negated, such as "^a" or "^a-e"
111 * - set containing all characters except those defined</li>
112 * <li>Combinations, such as "abe-g"
113 * - set containing all the characters from the individual sets</li>
114 * </ul>
115 *
116 * <p>The matching order is:</p>
117 * <ol>
118 * <li>Negated multi character range, such as "^a-e"</li>
119 * <li>Ordinary multi character range, such as "a-e"</li>
120 * <li>Negated single character, such as "^a"</li>
121 * <li>Ordinary single character, such as "a"</li>
122 * </ol>
123 *
124 * <p>Matching works left to right. Once a match is found the
125 * search starts again from the next character.</p>
126 *
127 * <p>If the same range is defined twice using the same syntax, only
128 * one range will be kept.
129 * Thus, "a-ca-c" creates only one range of "a-c".</p>
130 *
131 * <p>If the start and end of a range are in the wrong order,
132 * they are reversed. Thus "a-e" is the same as "e-a".
133 * As a result, "a-ee-a" would create only one range,
134 * as the "a-e" and "e-a" are the same.</p>
135 *
136 * <p>The set of characters represented is the union of the specified ranges.</p>
137 *
138 * <p>There are two ways to add a literal negation character ({@code ^}):</p>
139 * <ul>
140 * <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
141 * <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
142 * </ul>
143 *
144 * <p>Examples using the negation character:</p>
145 * <pre>
146 * CharSet.getInstance("^a-c").contains('a') = false
147 * CharSet.getInstance("^a-c").contains('d') = true
148 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
149 * CharSet.getInstance("^^a-c").contains('^') = false
150 * CharSet.getInstance("^a-cd-f").contains('d') = true
151 * CharSet.getInstance("a-c^").contains('^') = true
152 * CharSet.getInstance("^", "a-c").contains('^') = true
153 * </pre>
154 *
155 * <p>All CharSet objects returned by this method will be immutable.</p>
156 *
157 * @param setStrs Strings to merge into the set, may be null.
158 * @return a CharSet instance.
159 * @since 2.4
160 */
161 public static CharSet getInstance(final String... setStrs) {
162 if (setStrs == null) {
163 return EMPTY;
164 }
165 if (setStrs.length == 1) {
166 final CharSet common = COMMON.get(setStrs[0]);
167 if (common != null) {
168 return common;
169 }
170 }
171 return new CharSet(setStrs);
172 }
173
174 /** The set of CharRange objects. */
175 private final Set<CharRange> set = Collections.synchronizedSet(new LinkedHashSet<>());
176
177 /**
178 * Constructs a new CharSet using the set syntax.
179 * Each string is merged in with the set.
180 *
181 * @param set Strings to merge into the initial set.
182 * @throws NullPointerException if set is {@code null}.
183 */
184 protected CharSet(final String... set) {
185 Stream.of(set).forEach(this::add);
186 }
187
188 /**
189 * Add a set definition string to the {@link CharSet}.
190 *
191 * @param str set definition string
192 */
193 protected void add(final String str) {
194 if (str == null) {
195 return;
196 }
197 final int len = str.length();
198 int pos = 0;
199 while (pos < len) {
200 final int remainder = len - pos;
201 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
202 // negated range
203 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
204 pos += 4;
205 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
206 // range
207 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
208 pos += 3;
209 } else if (remainder >= 2 && str.charAt(pos) == '^') {
210 // negated char
211 set.add(CharRange.isNot(str.charAt(pos + 1)));
212 pos += 2;
213 } else {
214 // char
215 set.add(CharRange.is(str.charAt(pos)));
216 pos += 1;
217 }
218 }
219 }
220
221 /**
222 * Tests whether this {@link CharSet} contain the specified character {@code ch}.
223 * <p>
224 * Examples using the negation character:
225 * </p>
226 * <pre>
227 * CharSet.getInstance("^a-c").contains('a') = false
228 * CharSet.getInstance("^a-c").contains('d') = true
229 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
230 * CharSet.getInstance("^^a-c").contains('^') = false
231 * CharSet.getInstance("^a-cd-f").contains('d') = true
232 * CharSet.getInstance("a-c^").contains('^') = true
233 * CharSet.getInstance("^", "a-c").contains('^') = true
234 * </pre>
235 *
236 * @param ch the character to check.
237 * @return {@code true} if the set contains the characters.
238 */
239 public boolean contains(final char ch) {
240 synchronized (set) {
241 return set.stream().anyMatch(range -> range.contains(ch));
242 }
243 }
244
245 /**
246 * Compares two {@link CharSet} objects, returning true if they represent
247 * exactly the same set of characters defined in the same way.
248 *
249 * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
250 * equal according to this method.</p>
251 *
252 * @param obj the object to compare.
253 * @return true if equal.
254 * @since 2.0
255 */
256 @Override
257 public boolean equals(final Object obj) {
258 if (obj == this) {
259 return true;
260 }
261 if (!(obj instanceof CharSet)) {
262 return false;
263 }
264 final CharSet other = (CharSet) obj;
265 return set.equals(other.set);
266 }
267
268 /**
269 * Gets the set of character ranges.
270 * <p>
271 * Package private for testing.
272 * </p>
273 *
274 * @return the set of character ranges.
275 */
276 Set<CharRange> getCharRanges() {
277 return set;
278 }
279
280 /**
281 * Gets a hash code compatible with the equals method.
282 *
283 * @return a suitable hash code.
284 * @since 2.0
285 */
286 @Override
287 public int hashCode() {
288 return 89 + set.hashCode();
289 }
290
291 /**
292 * Gets a string representation of the set.
293 *
294 * @return string representation of the set.
295 */
296 @Override
297 public String toString() {
298 return set.toString();
299 }
300
301 }