001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3; 018 019import java.io.Serializable; 020import java.util.Collections; 021import java.util.HashMap; 022import java.util.LinkedHashSet; 023import java.util.Map; 024import java.util.Set; 025import java.util.stream.Stream; 026 027/** 028 * A set of characters. 029 * 030 * <p>Instances are immutable, but instances of subclasses may not be.</p> 031 * 032 * <p>#ThreadSafe#</p> 033 * 034 * @since 1.0 035 */ 036public class CharSet implements Serializable { 037 038 /** 039 * Required for serialization support. Lang version 2.0. 040 * 041 * @see java.io.Serializable 042 */ 043 private static final long serialVersionUID = 5947847346149275958L; 044 045 /** 046 * A CharSet defining no characters. 047 * 048 * @since 2.0 049 */ 050 public static final CharSet EMPTY = new CharSet((String) null); 051 052 /** 053 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 054 * 055 * @since 2.0 056 */ 057 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 058 059 /** 060 * A CharSet defining ASCII alphabetic characters "a-z". 061 * 062 * @since 2.0 063 */ 064 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 065 066 /** 067 * A CharSet defining ASCII alphabetic characters "A-Z". 068 * 069 * @since 2.0 070 */ 071 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 072 073 /** 074 * A CharSet defining ASCII alphabetic characters "0-9". 075 * 076 * @since 2.0 077 */ 078 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 079 080 /** 081 * A Map of the common cases used in the factory. 082 * <p> 083 * Subclasses can add more common patterns if desired. 084 * </p> 085 * 086 * @since 2.0 087 */ 088 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>()); 089 090 static { 091 COMMON.put(null, EMPTY); 092 COMMON.put(StringUtils.EMPTY, EMPTY); 093 COMMON.put("a-zA-Z", ASCII_ALPHA); 094 COMMON.put("A-Za-z", ASCII_ALPHA); 095 COMMON.put("a-z", ASCII_ALPHA_LOWER); 096 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 097 COMMON.put("0-9", ASCII_NUMERIC); 098 } 099 100 /** 101 * Creates a new CharSet using the syntax described below. 102 * 103 * <ul> 104 * <li>{@code null} or empty string ("") 105 * - set containing no characters</li> 106 * <li>Single character, such as "a" 107 * - set containing just that character</li> 108 * <li>Multi character, such as "a-e" 109 * - set containing characters from one character to the other</li> 110 * <li>Negated, such as "^a" or "^a-e" 111 * - set containing all characters except those defined</li> 112 * <li>Combinations, such as "abe-g" 113 * - set containing all the characters from the individual sets</li> 114 * </ul> 115 * 116 * <p>The matching order is:</p> 117 * <ol> 118 * <li>Negated multi character range, such as "^a-e"</li> 119 * <li>Ordinary multi character range, such as "a-e"</li> 120 * <li>Negated single character, such as "^a"</li> 121 * <li>Ordinary single character, such as "a"</li> 122 * </ol> 123 * 124 * <p>Matching works left to right. Once a match is found the 125 * search starts again from the next character.</p> 126 * 127 * <p>If the same range is defined twice using the same syntax, only 128 * one range will be kept. 129 * Thus, "a-ca-c" creates only one range of "a-c".</p> 130 * 131 * <p>If the start and end of a range are in the wrong order, 132 * they are reversed. Thus "a-e" is the same as "e-a". 133 * As a result, "a-ee-a" would create only one range, 134 * as the "a-e" and "e-a" are the same.</p> 135 * 136 * <p>The set of characters represented is the union of the specified ranges.</p> 137 * 138 * <p>There are two ways to add a literal negation character ({@code ^}):</p> 139 * <ul> 140 * <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li> 141 * <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li> 142 * </ul> 143 * 144 * <p>Examples using the negation character:</p> 145 * <pre> 146 * CharSet.getInstance("^a-c").contains('a') = false 147 * CharSet.getInstance("^a-c").contains('d') = true 148 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated) 149 * CharSet.getInstance("^^a-c").contains('^') = false 150 * CharSet.getInstance("^a-cd-f").contains('d') = true 151 * CharSet.getInstance("a-c^").contains('^') = true 152 * CharSet.getInstance("^", "a-c").contains('^') = true 153 * </pre> 154 * 155 * <p>All CharSet objects returned by this method will be immutable.</p> 156 * 157 * @param setStrs Strings to merge into the set, may be null. 158 * @return a CharSet instance. 159 * @since 2.4 160 */ 161 public static CharSet getInstance(final String... setStrs) { 162 if (setStrs == null) { 163 return EMPTY; 164 } 165 if (setStrs.length == 1) { 166 final CharSet common = COMMON.get(setStrs[0]); 167 if (common != null) { 168 return common; 169 } 170 } 171 return new CharSet(setStrs); 172 } 173 174 /** The set of CharRange objects. */ 175 private final Set<CharRange> set = Collections.synchronizedSet(new LinkedHashSet<>()); 176 177 /** 178 * Constructs a new CharSet using the set syntax. 179 * Each string is merged in with the set. 180 * 181 * @param set Strings to merge into the initial set. 182 * @throws NullPointerException if set is {@code null}. 183 */ 184 protected CharSet(final String... set) { 185 Stream.of(set).forEach(this::add); 186 } 187 188 /** 189 * Add a set definition string to the {@link CharSet}. 190 * 191 * @param str set definition string 192 */ 193 protected void add(final String str) { 194 if (str == null) { 195 return; 196 } 197 final int len = str.length(); 198 int pos = 0; 199 while (pos < len) { 200 final int remainder = len - pos; 201 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 202 // negated range 203 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3))); 204 pos += 4; 205 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 206 // range 207 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2))); 208 pos += 3; 209 } else if (remainder >= 2 && str.charAt(pos) == '^') { 210 // negated char 211 set.add(CharRange.isNot(str.charAt(pos + 1))); 212 pos += 2; 213 } else { 214 // char 215 set.add(CharRange.is(str.charAt(pos))); 216 pos += 1; 217 } 218 } 219 } 220 221 /** 222 * Tests whether this {@link CharSet} contain the specified character {@code ch}. 223 * <p> 224 * Examples using the negation character: 225 * </p> 226 * <pre> 227 * CharSet.getInstance("^a-c").contains('a') = false 228 * CharSet.getInstance("^a-c").contains('d') = true 229 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated) 230 * CharSet.getInstance("^^a-c").contains('^') = false 231 * CharSet.getInstance("^a-cd-f").contains('d') = true 232 * CharSet.getInstance("a-c^").contains('^') = true 233 * CharSet.getInstance("^", "a-c").contains('^') = true 234 * </pre> 235 * 236 * @param ch the character to check. 237 * @return {@code true} if the set contains the characters. 238 */ 239 public boolean contains(final char ch) { 240 synchronized (set) { 241 return set.stream().anyMatch(range -> range.contains(ch)); 242 } 243 } 244 245 /** 246 * Compares two {@link CharSet} objects, returning true if they represent 247 * exactly the same set of characters defined in the same way. 248 * 249 * <p>The two sets {@code abc} and {@code a-c} are <em>not</em> 250 * equal according to this method.</p> 251 * 252 * @param obj the object to compare. 253 * @return true if equal. 254 * @since 2.0 255 */ 256 @Override 257 public boolean equals(final Object obj) { 258 if (obj == this) { 259 return true; 260 } 261 if (!(obj instanceof CharSet)) { 262 return false; 263 } 264 final CharSet other = (CharSet) obj; 265 return set.equals(other.set); 266 } 267 268 /** 269 * Gets the set of character ranges. 270 * <p> 271 * Package private for testing. 272 * </p> 273 * 274 * @return the set of character ranges. 275 */ 276 Set<CharRange> getCharRanges() { 277 return set; 278 } 279 280 /** 281 * Gets a hash code compatible with the equals method. 282 * 283 * @return a suitable hash code. 284 * @since 2.0 285 */ 286 @Override 287 public int hashCode() { 288 return 89 + set.hashCode(); 289 } 290 291 /** 292 * Gets a string representation of the set. 293 * 294 * @return string representation of the set. 295 */ 296 @Override 297 public String toString() { 298 return set.toString(); 299 } 300 301}