001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3; 018 019import java.io.Serializable; 020import java.util.Collections; 021import java.util.HashMap; 022import java.util.HashSet; 023import java.util.Map; 024import java.util.Set; 025 026/** 027 * <p>A set of characters.</p> 028 * 029 * <p>Instances are immutable, but instances of subclasses may not be.</p> 030 * 031 * <p>#ThreadSafe#</p> 032 * @since 1.0 033 * @version $Id: CharSet.java 1436770 2013-01-22 07:09:45Z ggregory $ 034 */ 035public class CharSet implements Serializable { 036 037 /** 038 * Required for serialization support. Lang version 2.0. 039 * 040 * @see java.io.Serializable 041 */ 042 private static final long serialVersionUID = 5947847346149275958L; 043 044 /** 045 * A CharSet defining no characters. 046 * @since 2.0 047 */ 048 public static final CharSet EMPTY = new CharSet((String) null); 049 050 /** 051 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 052 * @since 2.0 053 */ 054 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 055 056 /** 057 * A CharSet defining ASCII alphabetic characters "a-z". 058 * @since 2.0 059 */ 060 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 061 062 /** 063 * A CharSet defining ASCII alphabetic characters "A-Z". 064 * @since 2.0 065 */ 066 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 067 068 /** 069 * A CharSet defining ASCII alphabetic characters "0-9". 070 * @since 2.0 071 */ 072 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 073 074 /** 075 * A Map of the common cases used in the factory. 076 * Subclasses can add more common patterns if desired 077 * @since 2.0 078 */ 079 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<String, CharSet>()); 080 081 static { 082 COMMON.put(null, EMPTY); 083 COMMON.put("", EMPTY); 084 COMMON.put("a-zA-Z", ASCII_ALPHA); 085 COMMON.put("A-Za-z", ASCII_ALPHA); 086 COMMON.put("a-z", ASCII_ALPHA_LOWER); 087 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 088 COMMON.put("0-9", ASCII_NUMERIC); 089 } 090 091 /** The set of CharRange objects. */ 092 private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<CharRange>()); 093 094 //----------------------------------------------------------------------- 095 /** 096 * <p>Factory method to create a new CharSet using a special syntax.</p> 097 * 098 * <ul> 099 * <li>{@code null} or empty string ("") 100 * - set containing no characters</li> 101 * <li>Single character, such as "a" 102 * - set containing just that character</li> 103 * <li>Multi character, such as "a-e" 104 * - set containing characters from one character to the other</li> 105 * <li>Negated, such as "^a" or "^a-e" 106 * - set containing all characters except those defined</li> 107 * <li>Combinations, such as "abe-g" 108 * - set containing all the characters from the individual sets</li> 109 * </ul> 110 * 111 * <p>The matching order is:</p> 112 * <ol> 113 * <li>Negated multi character range, such as "^a-e" 114 * <li>Ordinary multi character range, such as "a-e" 115 * <li>Negated single character, such as "^a" 116 * <li>Ordinary single character, such as "a" 117 * </ol> 118 * <p>Matching works left to right. Once a match is found the 119 * search starts again from the next character.</p> 120 * 121 * <p>If the same range is defined twice using the same syntax, only 122 * one range will be kept. 123 * Thus, "a-ca-c" creates only one range of "a-c".</p> 124 * 125 * <p>If the start and end of a range are in the wrong order, 126 * they are reversed. Thus "a-e" is the same as "e-a". 127 * As a result, "a-ee-a" would create only one range, 128 * as the "a-e" and "e-a" are the same.</p> 129 * 130 * <p>The set of characters represented is the union of the specified ranges.</p> 131 * 132 * <p>All CharSet objects returned by this method will be immutable.</p> 133 * 134 * @param setStrs Strings to merge into the set, may be null 135 * @return a CharSet instance 136 * @since 2.4 137 */ 138 public static CharSet getInstance(final String... setStrs) { 139 if (setStrs == null) { 140 return null; 141 } 142 if (setStrs.length == 1) { 143 final CharSet common = COMMON.get(setStrs[0]); 144 if (common != null) { 145 return common; 146 } 147 } 148 return new CharSet(setStrs); 149 } 150 151 //----------------------------------------------------------------------- 152 /** 153 * <p>Constructs a new CharSet using the set syntax. 154 * Each string is merged in with the set.</p> 155 * 156 * @param set Strings to merge into the initial set 157 * @throws NullPointerException if set is {@code null} 158 */ 159 protected CharSet(final String... set) { 160 super(); 161 final int sz = set.length; 162 for (int i = 0; i < sz; i++) { 163 add(set[i]); 164 } 165 } 166 167 //----------------------------------------------------------------------- 168 /** 169 * <p>Add a set definition string to the {@code CharSet}.</p> 170 * 171 * @param str set definition string 172 */ 173 protected void add(final String str) { 174 if (str == null) { 175 return; 176 } 177 178 final int len = str.length(); 179 int pos = 0; 180 while (pos < len) { 181 final int remainder = len - pos; 182 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 183 // negated range 184 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3))); 185 pos += 4; 186 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 187 // range 188 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2))); 189 pos += 3; 190 } else if (remainder >= 2 && str.charAt(pos) == '^') { 191 // negated char 192 set.add(CharRange.isNot(str.charAt(pos + 1))); 193 pos += 2; 194 } else { 195 // char 196 set.add(CharRange.is(str.charAt(pos))); 197 pos += 1; 198 } 199 } 200 } 201 202 //----------------------------------------------------------------------- 203 /** 204 * <p>Gets the internal set as an array of CharRange objects.</p> 205 * 206 * @return an array of immutable CharRange objects 207 * @since 2.0 208 */ 209// NOTE: This is no longer public as CharRange is no longer a public class. 210// It may be replaced when CharSet moves to Range. 211 /*public*/ CharRange[] getCharRanges() { 212 return set.toArray(new CharRange[set.size()]); 213 } 214 215 //----------------------------------------------------------------------- 216 /** 217 * <p>Does the {@code CharSet} contain the specified 218 * character {@code ch}.</p> 219 * 220 * @param ch the character to check for 221 * @return {@code true} if the set contains the characters 222 */ 223 public boolean contains(final char ch) { 224 for (final CharRange range : set) { 225 if (range.contains(ch)) { 226 return true; 227 } 228 } 229 return false; 230 } 231 232 // Basics 233 //----------------------------------------------------------------------- 234 /** 235 * <p>Compares two {@code CharSet} objects, returning true if they represent 236 * exactly the same set of characters defined in the same way.</p> 237 * 238 * <p>The two sets {@code abc} and {@code a-c} are <i>not</i> 239 * equal according to this method.</p> 240 * 241 * @param obj the object to compare to 242 * @return true if equal 243 * @since 2.0 244 */ 245 @Override 246 public boolean equals(final Object obj) { 247 if (obj == this) { 248 return true; 249 } 250 if (obj instanceof CharSet == false) { 251 return false; 252 } 253 final CharSet other = (CharSet) obj; 254 return set.equals(other.set); 255 } 256 257 /** 258 * <p>Gets a hash code compatible with the equals method.</p> 259 * 260 * @return a suitable hash code 261 * @since 2.0 262 */ 263 @Override 264 public int hashCode() { 265 return 89 + set.hashCode(); 266 } 267 268 /** 269 * <p>Gets a string representation of the set.</p> 270 * 271 * @return string representation of the set 272 */ 273 @Override 274 public String toString() { 275 return set.toString(); 276 } 277 278}