001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3; 018 019import java.io.Serializable; 020import java.util.Collections; 021import java.util.HashMap; 022import java.util.HashSet; 023import java.util.Map; 024import java.util.Set; 025 026/** 027 * <p>A set of characters.</p> 028 * 029 * <p>Instances are immutable, but instances of subclasses may not be.</p> 030 * 031 * <p>#ThreadSafe#</p> 032 * @since 1.0 033 */ 034public class CharSet implements Serializable { 035 036 /** 037 * Required for serialization support. Lang version 2.0. 038 * 039 * @see java.io.Serializable 040 */ 041 private static final long serialVersionUID = 5947847346149275958L; 042 043 /** 044 * A CharSet defining no characters. 045 * @since 2.0 046 */ 047 public static final CharSet EMPTY = new CharSet((String) null); 048 049 /** 050 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 051 * @since 2.0 052 */ 053 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 054 055 /** 056 * A CharSet defining ASCII alphabetic characters "a-z". 057 * @since 2.0 058 */ 059 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 060 061 /** 062 * A CharSet defining ASCII alphabetic characters "A-Z". 063 * @since 2.0 064 */ 065 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 066 067 /** 068 * A CharSet defining ASCII alphabetic characters "0-9". 069 * @since 2.0 070 */ 071 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 072 073 /** 074 * A Map of the common cases used in the factory. 075 * Subclasses can add more common patterns if desired 076 * @since 2.0 077 */ 078 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>()); 079 080 static { 081 COMMON.put(null, EMPTY); 082 COMMON.put(StringUtils.EMPTY, EMPTY); 083 COMMON.put("a-zA-Z", ASCII_ALPHA); 084 COMMON.put("A-Za-z", ASCII_ALPHA); 085 COMMON.put("a-z", ASCII_ALPHA_LOWER); 086 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 087 COMMON.put("0-9", ASCII_NUMERIC); 088 } 089 090 /** The set of CharRange objects. */ 091 private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>()); 092 093 //----------------------------------------------------------------------- 094 /** 095 * <p>Factory method to create a new CharSet using a special syntax.</p> 096 * 097 * <ul> 098 * <li>{@code null} or empty string ("") 099 * - set containing no characters</li> 100 * <li>Single character, such as "a" 101 * - set containing just that character</li> 102 * <li>Multi character, such as "a-e" 103 * - set containing characters from one character to the other</li> 104 * <li>Negated, such as "^a" or "^a-e" 105 * - set containing all characters except those defined</li> 106 * <li>Combinations, such as "abe-g" 107 * - set containing all the characters from the individual sets</li> 108 * </ul> 109 * 110 * <p>The matching order is:</p> 111 * <ol> 112 * <li>Negated multi character range, such as "^a-e" 113 * <li>Ordinary multi character range, such as "a-e" 114 * <li>Negated single character, such as "^a" 115 * <li>Ordinary single character, such as "a" 116 * </ol> 117 * 118 * <p>Matching works left to right. Once a match is found the 119 * search starts again from the next character.</p> 120 * 121 * <p>If the same range is defined twice using the same syntax, only 122 * one range will be kept. 123 * Thus, "a-ca-c" creates only one range of "a-c".</p> 124 * 125 * <p>If the start and end of a range are in the wrong order, 126 * they are reversed. Thus "a-e" is the same as "e-a". 127 * As a result, "a-ee-a" would create only one range, 128 * as the "a-e" and "e-a" are the same.</p> 129 * 130 * <p>The set of characters represented is the union of the specified ranges.</p> 131 * 132 * <p>There are two ways to add a literal negation character ({@code ^}):</p> 133 * <ul> 134 * <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li> 135 * <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li> 136 * </ul> 137 * 138 * <p>Examples using the negation character:</p> 139 * <pre> 140 * CharSet.getInstance("^a-c").contains('a') = false 141 * CharSet.getInstance("^a-c").contains('d') = true 142 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated) 143 * CharSet.getInstance("^^a-c").contains('^') = false 144 * CharSet.getInstance("^a-cd-f").contains('d') = true 145 * CharSet.getInstance("a-c^").contains('^') = true 146 * CharSet.getInstance("^", "a-c").contains('^') = true 147 * </pre> 148 * 149 * <p>All CharSet objects returned by this method will be immutable.</p> 150 * 151 * @param setStrs Strings to merge into the set, may be null 152 * @return a CharSet instance 153 * @since 2.4 154 */ 155 public static CharSet getInstance(final String... setStrs) { 156 if (setStrs == null) { 157 return null; 158 } 159 if (setStrs.length == 1) { 160 final CharSet common = COMMON.get(setStrs[0]); 161 if (common != null) { 162 return common; 163 } 164 } 165 return new CharSet(setStrs); 166 } 167 168 //----------------------------------------------------------------------- 169 /** 170 * <p>Constructs a new CharSet using the set syntax. 171 * Each string is merged in with the set.</p> 172 * 173 * @param set Strings to merge into the initial set 174 * @throws NullPointerException if set is {@code null} 175 */ 176 protected CharSet(final String... set) { 177 super(); 178 for (final String s : set) { 179 add(s); 180 } 181 } 182 183 //----------------------------------------------------------------------- 184 /** 185 * <p>Add a set definition string to the {@code CharSet}.</p> 186 * 187 * @param str set definition string 188 */ 189 protected void add(final String str) { 190 if (str == null) { 191 return; 192 } 193 194 final int len = str.length(); 195 int pos = 0; 196 while (pos < len) { 197 final int remainder = len - pos; 198 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 199 // negated range 200 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3))); 201 pos += 4; 202 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 203 // range 204 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2))); 205 pos += 3; 206 } else if (remainder >= 2 && str.charAt(pos) == '^') { 207 // negated char 208 set.add(CharRange.isNot(str.charAt(pos + 1))); 209 pos += 2; 210 } else { 211 // char 212 set.add(CharRange.is(str.charAt(pos))); 213 pos += 1; 214 } 215 } 216 } 217 218 //----------------------------------------------------------------------- 219 /** 220 * <p>Gets the internal set as an array of CharRange objects.</p> 221 * 222 * @return an array of immutable CharRange objects 223 * @since 2.0 224 */ 225// NOTE: This is no longer public as CharRange is no longer a public class. 226// It may be replaced when CharSet moves to Range. 227 /*public*/ CharRange[] getCharRanges() { 228 return set.toArray(new CharRange[0]); 229 } 230 231 //----------------------------------------------------------------------- 232 /** 233 * <p>Does the {@code CharSet} contain the specified 234 * character {@code ch}.</p> 235 * 236 * @param ch the character to check for 237 * @return {@code true} if the set contains the characters 238 */ 239 public boolean contains(final char ch) { 240 synchronized(set) { 241 for (final CharRange range : set) { 242 if (range.contains(ch)) { 243 return true; 244 } 245 } 246 } 247 return false; 248 } 249 250 // Basics 251 //----------------------------------------------------------------------- 252 /** 253 * <p>Compares two {@code CharSet} objects, returning true if they represent 254 * exactly the same set of characters defined in the same way.</p> 255 * 256 * <p>The two sets {@code abc} and {@code a-c} are <i>not</i> 257 * equal according to this method.</p> 258 * 259 * @param obj the object to compare to 260 * @return true if equal 261 * @since 2.0 262 */ 263 @Override 264 public boolean equals(final Object obj) { 265 if (obj == this) { 266 return true; 267 } 268 if (!(obj instanceof CharSet)) { 269 return false; 270 } 271 final CharSet other = (CharSet) obj; 272 return set.equals(other.set); 273 } 274 275 /** 276 * <p>Gets a hash code compatible with the equals method.</p> 277 * 278 * @return a suitable hash code 279 * @since 2.0 280 */ 281 @Override 282 public int hashCode() { 283 return 89 + set.hashCode(); 284 } 285 286 /** 287 * <p>Gets a string representation of the set.</p> 288 * 289 * @return string representation of the set 290 */ 291 @Override 292 public String toString() { 293 return set.toString(); 294 } 295 296}