001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3;
018
019 import java.io.Serializable;
020 import java.util.Collections;
021 import java.util.HashMap;
022 import java.util.HashSet;
023 import java.util.Map;
024 import java.util.Set;
025
026 /**
027 * <p>A set of characters.</p>
028 *
029 * <p>Instances are immutable, but instances of subclasses may not be.</p>
030 *
031 * <p>#ThreadSafe#</p>
032 * @since 1.0
033 * @version $Id: CharSet.java 1090427 2011-04-08 20:17:10Z bayard $
034 */
035 public class CharSet implements Serializable {
036
037 /**
038 * Required for serialization support. Lang version 2.0.
039 *
040 * @see java.io.Serializable
041 */
042 private static final long serialVersionUID = 5947847346149275958L;
043
044 /**
045 * A CharSet defining no characters.
046 * @since 2.0
047 */
048 public static final CharSet EMPTY = new CharSet((String) null);
049
050 /**
051 * A CharSet defining ASCII alphabetic characters "a-zA-Z".
052 * @since 2.0
053 */
054 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
055
056 /**
057 * A CharSet defining ASCII alphabetic characters "a-z".
058 * @since 2.0
059 */
060 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
061
062 /**
063 * A CharSet defining ASCII alphabetic characters "A-Z".
064 * @since 2.0
065 */
066 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
067
068 /**
069 * A CharSet defining ASCII alphabetic characters "0-9".
070 * @since 2.0
071 */
072 public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
073
074 /**
075 * A Map of the common cases used in the factory.
076 * Subclasses can add more common patterns if desired
077 * @since 2.0
078 */
079 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<String, CharSet>());
080
081 static {
082 COMMON.put(null, EMPTY);
083 COMMON.put("", EMPTY);
084 COMMON.put("a-zA-Z", ASCII_ALPHA);
085 COMMON.put("A-Za-z", ASCII_ALPHA);
086 COMMON.put("a-z", ASCII_ALPHA_LOWER);
087 COMMON.put("A-Z", ASCII_ALPHA_UPPER);
088 COMMON.put("0-9", ASCII_NUMERIC);
089 }
090
091 /** The set of CharRange objects. */
092 private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<CharRange>());
093
094 //-----------------------------------------------------------------------
095 /**
096 * <p>Factory method to create a new CharSet using a special syntax.</p>
097 *
098 * <ul>
099 * <li>{@code null} or empty string ("")
100 * - set containing no characters</li>
101 * <li>Single character, such as "a"
102 * - set containing just that character</li>
103 * <li>Multi character, such as "a-e"
104 * - set containing characters from one character to the other</li>
105 * <li>Negated, such as "^a" or "^a-e"
106 * - set containing all characters except those defined</li>
107 * <li>Combinations, such as "abe-g"
108 * - set containing all the characters from the individual sets</li>
109 * </ul>
110 *
111 * <p>The matching order is:</p>
112 * <ol>
113 * <li>Negated multi character range, such as "^a-e"
114 * <li>Ordinary multi character range, such as "a-e"
115 * <li>Negated single character, such as "^a"
116 * <li>Ordinary single character, such as "a"
117 * </ol>
118 * <p>Matching works left to right. Once a match is found the
119 * search starts again from the next character.</p>
120 *
121 * <p>If the same range is defined twice using the same syntax, only
122 * one range will be kept.
123 * Thus, "a-ca-c" creates only one range of "a-c".</p>
124 *
125 * <p>If the start and end of a range are in the wrong order,
126 * they are reversed. Thus "a-e" is the same as "e-a".
127 * As a result, "a-ee-a" would create only one range,
128 * as the "a-e" and "e-a" are the same.</p>
129 *
130 * <p>The set of characters represented is the union of the specified ranges.</p>
131 *
132 * <p>All CharSet objects returned by this method will be immutable.</p>
133 *
134 * @param setStrs Strings to merge into the set, may be null
135 * @return a CharSet instance
136 * @since 2.4
137 */
138 public static CharSet getInstance(String... setStrs) {
139 if (setStrs == null) {
140 return null;
141 }
142 if (setStrs.length == 1) {
143 CharSet common = COMMON.get(setStrs[0]);
144 if (common != null) {
145 return common;
146 }
147 }
148 return new CharSet(setStrs);
149 }
150
151 //-----------------------------------------------------------------------
152 /**
153 * <p>Constructs a new CharSet using the set syntax.
154 * Each string is merged in with the set.</p>
155 *
156 * @param set Strings to merge into the initial set
157 * @throws NullPointerException if set is {@code null}
158 */
159 protected CharSet(String... set) {
160 super();
161 int sz = set.length;
162 for (int i = 0; i < sz; i++) {
163 add(set[i]);
164 }
165 }
166
167 //-----------------------------------------------------------------------
168 /**
169 * <p>Add a set definition string to the {@code CharSet}.</p>
170 *
171 * @param str set definition string
172 */
173 protected void add(String str) {
174 if (str == null) {
175 return;
176 }
177
178 int len = str.length();
179 int pos = 0;
180 while (pos < len) {
181 int remainder = (len - pos);
182 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
183 // negated range
184 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
185 pos += 4;
186 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
187 // range
188 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
189 pos += 3;
190 } else if (remainder >= 2 && str.charAt(pos) == '^') {
191 // negated char
192 set.add(CharRange.isNot(str.charAt(pos + 1)));
193 pos += 2;
194 } else {
195 // char
196 set.add(CharRange.is(str.charAt(pos)));
197 pos += 1;
198 }
199 }
200 }
201
202 //-----------------------------------------------------------------------
203 /**
204 * <p>Gets the internal set as an array of CharRange objects.</p>
205 *
206 * @return an array of immutable CharRange objects
207 * @since 2.0
208 */
209 // NOTE: This is no longer public as CharRange is no longer a public class.
210 // It may be replaced when CharSet moves to Range.
211 /*public*/ CharRange[] getCharRanges() {
212 return set.toArray(new CharRange[set.size()]);
213 }
214
215 //-----------------------------------------------------------------------
216 /**
217 * <p>Does the {@code CharSet} contain the specified
218 * character {@code ch}.</p>
219 *
220 * @param ch the character to check for
221 * @return {@code true} if the set contains the characters
222 */
223 public boolean contains(char ch) {
224 for (CharRange range : set) {
225 if (range.contains(ch)) {
226 return true;
227 }
228 }
229 return false;
230 }
231
232 // Basics
233 //-----------------------------------------------------------------------
234 /**
235 * <p>Compares two {@code CharSet} objects, returning true if they represent
236 * exactly the same set of characters defined in the same way.</p>
237 *
238 * <p>The two sets {@code abc} and {@code a-c} are <i>not</i>
239 * equal according to this method.</p>
240 *
241 * @param obj the object to compare to
242 * @return true if equal
243 * @since 2.0
244 */
245 @Override
246 public boolean equals(Object obj) {
247 if (obj == this) {
248 return true;
249 }
250 if (obj instanceof CharSet == false) {
251 return false;
252 }
253 CharSet other = (CharSet) obj;
254 return set.equals(other.set);
255 }
256
257 /**
258 * <p>Gets a hash code compatible with the equals method.</p>
259 *
260 * @return a suitable hash code
261 * @since 2.0
262 */
263 @Override
264 public int hashCode() {
265 return 89 + set.hashCode();
266 }
267
268 /**
269 * <p>Gets a string representation of the set.</p>
270 *
271 * @return string representation of the set
272 */
273 @Override
274 public String toString() {
275 return set.toString();
276 }
277
278 }