1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3;
18
19 import java.io.Serializable;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.Map;
24 import java.util.Set;
25 import java.util.stream.Stream;
26
27 /**
28 * A set of characters.
29 *
30 * <p>Instances are immutable, but instances of subclasses may not be.</p>
31 *
32 * <p>#ThreadSafe#</p>
33 * @since 1.0
34 */
35 public class CharSet implements Serializable {
36
37 /**
38 * Required for serialization support. Lang version 2.0.
39 *
40 * @see java.io.Serializable
41 */
42 private static final long serialVersionUID = 5947847346149275958L;
43
44 /**
45 * A CharSet defining no characters.
46 * @since 2.0
47 */
48 public static final CharSet EMPTY = new CharSet((String) null);
49
50 /**
51 * A CharSet defining ASCII alphabetic characters "a-zA-Z".
52 * @since 2.0
53 */
54 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
55
56 /**
57 * A CharSet defining ASCII alphabetic characters "a-z".
58 * @since 2.0
59 */
60 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
61
62 /**
63 * A CharSet defining ASCII alphabetic characters "A-Z".
64 * @since 2.0
65 */
66 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
67
68 /**
69 * A CharSet defining ASCII alphabetic characters "0-9".
70 * @since 2.0
71 */
72 public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
73
74 /**
75 * A Map of the common cases used in the factory.
76 * Subclasses can add more common patterns if desired
77 * @since 2.0
78 */
79 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
80
81 static {
82 COMMON.put(null, EMPTY);
83 COMMON.put(StringUtils.EMPTY, EMPTY);
84 COMMON.put("a-zA-Z", ASCII_ALPHA);
85 COMMON.put("A-Za-z", ASCII_ALPHA);
86 COMMON.put("a-z", ASCII_ALPHA_LOWER);
87 COMMON.put("A-Z", ASCII_ALPHA_UPPER);
88 COMMON.put("0-9", ASCII_NUMERIC);
89 }
90
91 /**
92 * Factory method to create a new CharSet using a special syntax.
93 *
94 * <ul>
95 * <li>{@code null} or empty string ("")
96 * - set containing no characters</li>
97 * <li>Single character, such as "a"
98 * - set containing just that character</li>
99 * <li>Multi character, such as "a-e"
100 * - set containing characters from one character to the other</li>
101 * <li>Negated, such as "^a" or "^a-e"
102 * - set containing all characters except those defined</li>
103 * <li>Combinations, such as "abe-g"
104 * - set containing all the characters from the individual sets</li>
105 * </ul>
106 *
107 * <p>The matching order is:</p>
108 * <ol>
109 * <li>Negated multi character range, such as "^a-e"
110 * <li>Ordinary multi character range, such as "a-e"
111 * <li>Negated single character, such as "^a"
112 * <li>Ordinary single character, such as "a"
113 * </ol>
114 *
115 * <p>Matching works left to right. Once a match is found the
116 * search starts again from the next character.</p>
117 *
118 * <p>If the same range is defined twice using the same syntax, only
119 * one range will be kept.
120 * Thus, "a-ca-c" creates only one range of "a-c".</p>
121 *
122 * <p>If the start and end of a range are in the wrong order,
123 * they are reversed. Thus "a-e" is the same as "e-a".
124 * As a result, "a-ee-a" would create only one range,
125 * as the "a-e" and "e-a" are the same.</p>
126 *
127 * <p>The set of characters represented is the union of the specified ranges.</p>
128 *
129 * <p>There are two ways to add a literal negation character ({@code ^}):</p>
130 * <ul>
131 * <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
132 * <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
133 * </ul>
134 *
135 * <p>Examples using the negation character:</p>
136 * <pre>
137 * CharSet.getInstance("^a-c").contains('a') = false
138 * CharSet.getInstance("^a-c").contains('d') = true
139 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
140 * CharSet.getInstance("^^a-c").contains('^') = false
141 * CharSet.getInstance("^a-cd-f").contains('d') = true
142 * CharSet.getInstance("a-c^").contains('^') = true
143 * CharSet.getInstance("^", "a-c").contains('^') = true
144 * </pre>
145 *
146 * <p>All CharSet objects returned by this method will be immutable.</p>
147 *
148 * @param setStrs Strings to merge into the set, may be null
149 * @return a CharSet instance
150 * @since 2.4
151 */
152 public static CharSet getInstance(final String... setStrs) {
153 if (setStrs == null) {
154 return null;
155 }
156 if (setStrs.length == 1) {
157 final CharSet common = COMMON.get(setStrs[0]);
158 if (common != null) {
159 return common;
160 }
161 }
162 return new CharSet(setStrs);
163 }
164
165 /** The set of CharRange objects. */
166 private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());
167
168 /**
169 * Constructs a new CharSet using the set syntax.
170 * Each string is merged in with the set.
171 *
172 * @param set Strings to merge into the initial set
173 * @throws NullPointerException if set is {@code null}
174 */
175 protected CharSet(final String... set) {
176 Stream.of(set).forEach(this::add);
177 }
178
179 /**
180 * Add a set definition string to the {@link CharSet}.
181 *
182 * @param str set definition string
183 */
184 protected void add(final String str) {
185 if (str == null) {
186 return;
187 }
188
189 final int len = str.length();
190 int pos = 0;
191 while (pos < len) {
192 final int remainder = len - pos;
193 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
194 // negated range
195 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
196 pos += 4;
197 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
198 // range
199 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
200 pos += 3;
201 } else if (remainder >= 2 && str.charAt(pos) == '^') {
202 // negated char
203 set.add(CharRange.isNot(str.charAt(pos + 1)));
204 pos += 2;
205 } else {
206 // char
207 set.add(CharRange.is(str.charAt(pos)));
208 pos += 1;
209 }
210 }
211 }
212
213 /**
214 * Does the {@link CharSet} contain the specified
215 * character {@code ch}.
216 *
217 * @param ch the character to check for
218 * @return {@code true} if the set contains the characters
219 */
220 public boolean contains(final char ch) {
221 synchronized (set) {
222 return set.stream().anyMatch(range -> range.contains(ch));
223 }
224 }
225
226 // Basics
227 /**
228 * Compares two {@link CharSet} objects, returning true if they represent
229 * exactly the same set of characters defined in the same way.
230 *
231 * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
232 * equal according to this method.</p>
233 *
234 * @param obj the object to compare to
235 * @return true if equal
236 * @since 2.0
237 */
238 @Override
239 public boolean equals(final Object obj) {
240 if (obj == this) {
241 return true;
242 }
243 if (!(obj instanceof CharSet)) {
244 return false;
245 }
246 final CharSet other = (CharSet) obj;
247 return set.equals(other.set);
248 }
249
250 /**
251 * Gets the internal set as an array of CharRange objects.
252 *
253 * @return an array of immutable CharRange objects
254 * @since 2.0
255 */
256 // NOTE: This is no longer public as CharRange is no longer a public class.
257 // It may be replaced when CharSet moves to Range.
258 /*public*/ CharRange[] getCharRanges() {
259 return set.toArray(CharRange.EMPTY_ARRAY);
260 }
261
262 /**
263 * Gets a hash code compatible with the equals method.
264 *
265 * @return a suitable hash code
266 * @since 2.0
267 */
268 @Override
269 public int hashCode() {
270 return 89 + set.hashCode();
271 }
272
273 /**
274 * Gets a string representation of the set.
275 *
276 * @return string representation of the set
277 */
278 @Override
279 public String toString() {
280 return set.toString();
281 }
282
283 }