001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.lang; 018 019 import java.io.Serializable; 020 import java.util.Collections; 021 import java.util.HashMap; 022 import java.util.HashSet; 023 import java.util.Iterator; 024 import java.util.Map; 025 import java.util.Set; 026 027 /** 028 * <p>A set of characters.</p> 029 * 030 * <p>Instances are immutable, but instances of subclasses may not be.</p> 031 * 032 * @author Apache Software Foundation 033 * @author Phil Steitz 034 * @author Pete Gieser 035 * @author Gary Gregory 036 * @since 1.0 037 * @version $Id: CharSet.java 905671 2010-02-02 15:25:14Z niallp $ 038 */ 039 public class CharSet implements Serializable { 040 041 /** 042 * Required for serialization support. Lang version 2.0. 043 * 044 * @see java.io.Serializable 045 */ 046 private static final long serialVersionUID = 5947847346149275958L; 047 048 /** 049 * A CharSet defining no characters. 050 * @since 2.0 051 */ 052 public static final CharSet EMPTY = new CharSet((String) null); 053 054 /** 055 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 056 * @since 2.0 057 */ 058 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 059 060 /** 061 * A CharSet defining ASCII alphabetic characters "a-z". 062 * @since 2.0 063 */ 064 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 065 066 /** 067 * A CharSet defining ASCII alphabetic characters "A-Z". 068 * @since 2.0 069 */ 070 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 071 072 /** 073 * A CharSet defining ASCII alphabetic characters "0-9". 074 * @since 2.0 075 */ 076 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 077 078 /** 079 * A Map of the common cases used in the factory. 080 * Subclasses can add more common patterns if desired 081 * @since 2.0 082 */ 083 protected static final Map COMMON = Collections.synchronizedMap(new HashMap()); 084 085 static { 086 COMMON.put(null, EMPTY); 087 COMMON.put("", EMPTY); 088 COMMON.put("a-zA-Z", ASCII_ALPHA); 089 COMMON.put("A-Za-z", ASCII_ALPHA); 090 COMMON.put("a-z", ASCII_ALPHA_LOWER); 091 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 092 COMMON.put("0-9", ASCII_NUMERIC); 093 } 094 095 /** The set of CharRange objects. */ 096 private final Set set = new HashSet(); 097 098 //----------------------------------------------------------------------- 099 /** 100 * <p>Factory method to create a new CharSet using a special syntax.</p> 101 * 102 * <ul> 103 * <li><code>null</code> or empty string ("") 104 * - set containing no characters</li> 105 * <li>Single character, such as "a" 106 * - set containing just that character</li> 107 * <li>Multi character, such as "a-e" 108 * - set containing characters from one character to the other</li> 109 * <li>Negated, such as "^a" or "^a-e" 110 * - set containing all characters except those defined</li> 111 * <li>Combinations, such as "abe-g" 112 * - set containing all the characters from the individual sets</li> 113 * </ul> 114 * 115 * <p>The matching order is:</p> 116 * <ol> 117 * <li>Negated multi character range, such as "^a-e" 118 * <li>Ordinary multi character range, such as "a-e" 119 * <li>Negated single character, such as "^a" 120 * <li>Ordinary single character, such as "a" 121 * </ol> 122 * <p>Matching works left to right. Once a match is found the 123 * search starts again from the next character.</p> 124 * 125 * <p>If the same range is defined twice using the same syntax, only 126 * one range will be kept. 127 * Thus, "a-ca-c" creates only one range of "a-c".</p> 128 * 129 * <p>If the start and end of a range are in the wrong order, 130 * they are reversed. Thus "a-e" is the same as "e-a". 131 * As a result, "a-ee-a" would create only one range, 132 * as the "a-e" and "e-a" are the same.</p> 133 * 134 * <p>The set of characters represented is the union of the specified ranges.</p> 135 * 136 * <p>All CharSet objects returned by this method will be immutable.</p> 137 * 138 * @param setStr the String describing the set, may be null 139 * @return a CharSet instance 140 * @since 2.0 141 */ 142 public static CharSet getInstance(String setStr) { 143 Object set = COMMON.get(setStr); 144 if (set != null) { 145 return (CharSet) set; 146 } 147 return new CharSet(setStr); 148 } 149 150 /** 151 * <p>Constructs a new CharSet using the set syntax. 152 * Each string is merged in with the set.</p> 153 * 154 * @param setStrs Strings to merge into the initial set, may be null 155 * @return a CharSet instance 156 * @since 2.4 157 */ 158 public static CharSet getInstance(String[] setStrs) { 159 if (setStrs == null) { 160 return null; 161 } 162 return new CharSet(setStrs); 163 } 164 165 //----------------------------------------------------------------------- 166 /** 167 * <p>Constructs a new CharSet using the set syntax.</p> 168 * 169 * @param setStr the String describing the set, may be null 170 * @since 2.0 171 */ 172 protected CharSet(String setStr) { 173 super(); 174 add(setStr); 175 } 176 177 /** 178 * <p>Constructs a new CharSet using the set syntax. 179 * Each string is merged in with the set.</p> 180 * 181 * @param set Strings to merge into the initial set 182 * @throws NullPointerException if set is <code>null</code> 183 */ 184 protected CharSet(String[] set) { 185 super(); 186 int sz = set.length; 187 for (int i = 0; i < sz; i++) { 188 add(set[i]); 189 } 190 } 191 192 //----------------------------------------------------------------------- 193 /** 194 * <p>Add a set definition string to the <code>CharSet</code>.</p> 195 * 196 * @param str set definition string 197 */ 198 protected void add(String str) { 199 if (str == null) { 200 return; 201 } 202 203 int len = str.length(); 204 int pos = 0; 205 while (pos < len) { 206 int remainder = (len - pos); 207 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 208 // negated range 209 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3))); 210 pos += 4; 211 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 212 // range 213 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2))); 214 pos += 3; 215 } else if (remainder >= 2 && str.charAt(pos) == '^') { 216 // negated char 217 set.add(CharRange.isNot(str.charAt(pos + 1))); 218 pos += 2; 219 } else { 220 // char 221 set.add(CharRange.is(str.charAt(pos))); 222 pos += 1; 223 } 224 } 225 } 226 227 //----------------------------------------------------------------------- 228 /** 229 * <p>Gets the internal set as an array of CharRange objects.</p> 230 * 231 * @return an array of immutable CharRange objects 232 * @since 2.0 233 */ 234 public CharRange[] getCharRanges() { 235 return (CharRange[]) set.toArray(new CharRange[set.size()]); 236 } 237 238 //----------------------------------------------------------------------- 239 /** 240 * <p>Does the <code>CharSet</code> contain the specified 241 * character <code>ch</code>.</p> 242 * 243 * @param ch the character to check for 244 * @return <code>true</code> if the set contains the characters 245 */ 246 public boolean contains(char ch) { 247 for (Iterator it = set.iterator(); it.hasNext();) { 248 CharRange range = (CharRange) it.next(); 249 if (range.contains(ch)) { 250 return true; 251 } 252 } 253 return false; 254 } 255 256 // Basics 257 //----------------------------------------------------------------------- 258 /** 259 * <p>Compares two CharSet objects, returning true if they represent 260 * exactly the same set of characters defined in the same way.</p> 261 * 262 * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i> 263 * equal according to this method.</p> 264 * 265 * @param obj the object to compare to 266 * @return true if equal 267 * @since 2.0 268 */ 269 public boolean equals(Object obj) { 270 if (obj == this) { 271 return true; 272 } 273 if (obj instanceof CharSet == false) { 274 return false; 275 } 276 CharSet other = (CharSet) obj; 277 return set.equals(other.set); 278 } 279 280 /** 281 * <p>Gets a hashCode compatible with the equals method.</p> 282 * 283 * @return a suitable hashCode 284 * @since 2.0 285 */ 286 public int hashCode() { 287 return 89 + set.hashCode(); 288 } 289 290 /** 291 * <p>Gets a string representation of the set.</p> 292 * 293 * @return string representation of the set 294 */ 295 public String toString() { 296 return set.toString(); 297 } 298 299 }