001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang;
018    
019    import java.io.Serializable;
020    import java.util.Collections;
021    import java.util.HashMap;
022    import java.util.HashSet;
023    import java.util.Iterator;
024    import java.util.Map;
025    import java.util.Set;
026    
027    /**
028     * <p>A set of characters.</p>
029     *
030     * <p>Instances are immutable, but instances of subclasses may not be.</p>
031     *
032     * <p>#ThreadSafe#</p>
033     * @author Apache Software Foundation
034     * @author Phil Steitz
035     * @author Pete Gieser
036     * @author Gary Gregory
037     * @since 1.0
038     * @version $Id: CharSet.java 1056988 2011-01-09 17:58:53Z niallp $
039     */
040    public class CharSet implements Serializable {
041    
042        /**
043         * Required for serialization support. Lang version 2.0. 
044         * 
045         * @see java.io.Serializable
046         */
047        private static final long serialVersionUID = 5947847346149275958L;
048    
049        /** 
050         * A CharSet defining no characters. 
051         * @since 2.0
052         */
053        public static final CharSet EMPTY = new CharSet((String) null);
054    
055        /** 
056         * A CharSet defining ASCII alphabetic characters "a-zA-Z".
057         * @since 2.0
058         */
059        public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
060    
061        /** 
062         * A CharSet defining ASCII alphabetic characters "a-z".
063         * @since 2.0
064         */
065        public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
066    
067        /** 
068         * A CharSet defining ASCII alphabetic characters "A-Z".
069         * @since 2.0
070         */
071        public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
072    
073        /** 
074         * A CharSet defining ASCII alphabetic characters "0-9".
075         * @since 2.0
076         */
077        public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
078    
079        /**
080         * A Map of the common cases used in the factory.
081         * Subclasses can add more common patterns if desired
082         * @since 2.0
083         */
084        protected static final Map COMMON = Collections.synchronizedMap(new HashMap());
085        
086        static {
087            COMMON.put(null, EMPTY);
088            COMMON.put("", EMPTY);
089            COMMON.put("a-zA-Z", ASCII_ALPHA);
090            COMMON.put("A-Za-z", ASCII_ALPHA);
091            COMMON.put("a-z", ASCII_ALPHA_LOWER);
092            COMMON.put("A-Z", ASCII_ALPHA_UPPER);
093            COMMON.put("0-9", ASCII_NUMERIC);
094        }
095    
096        /** The set of CharRange objects. */
097        private final Set set = Collections.synchronizedSet(new HashSet());
098    
099        //-----------------------------------------------------------------------
100        /**
101         * <p>Factory method to create a new CharSet using a special syntax.</p>
102         *
103         * <ul>
104         *  <li><code>null</code> or empty string ("")
105         * - set containing no characters</li>
106         *  <li>Single character, such as "a"
107         *  - set containing just that character</li>
108         *  <li>Multi character, such as "a-e"
109         *  - set containing characters from one character to the other</li>
110         *  <li>Negated, such as "^a" or "^a-e"
111         *  - set containing all characters except those defined</li>
112         *  <li>Combinations, such as "abe-g"
113         *  - set containing all the characters from the individual sets</li>
114         * </ul>
115         *
116         * <p>The matching order is:</p>
117         * <ol>
118         *  <li>Negated multi character range, such as "^a-e"
119         *  <li>Ordinary multi character range, such as "a-e"
120         *  <li>Negated single character, such as "^a"
121         *  <li>Ordinary single character, such as "a"
122         * </ol>
123         * <p>Matching works left to right. Once a match is found the
124         * search starts again from the next character.</p>
125         *
126         * <p>If the same range is defined twice using the same syntax, only
127         * one range will be kept.
128         * Thus, "a-ca-c" creates only one range of "a-c".</p>
129         *
130         * <p>If the start and end of a range are in the wrong order,
131         * they are reversed. Thus "a-e" is the same as "e-a".
132         * As a result, "a-ee-a" would create only one range,
133         * as the "a-e" and "e-a" are the same.</p>
134         *
135         * <p>The set of characters represented is the union of the specified ranges.</p>
136         *
137         * <p>All CharSet objects returned by this method will be immutable.</p>
138         *
139         * @param setStr  the String describing the set, may be null
140         * @return a CharSet instance
141         * @since 2.0
142         */
143        public static CharSet getInstance(String setStr) {
144            Object set = COMMON.get(setStr);
145            if (set != null) {
146                return (CharSet) set;
147            }
148            return new CharSet(setStr);
149        }
150    
151        /**
152         * <p>Constructs a new CharSet using the set syntax.
153         * Each string is merged in with the set.</p>
154         *
155         * @param setStrs  Strings to merge into the initial set, may be null
156         * @return a CharSet instance
157         * @since 2.4
158         */
159        public static CharSet getInstance(String[] setStrs) {
160            if (setStrs == null) {
161                return null;
162            }
163            return new CharSet(setStrs); 
164        }
165    
166        //-----------------------------------------------------------------------
167        /**
168         * <p>Constructs a new CharSet using the set syntax.</p>
169         *
170         * @param setStr  the String describing the set, may be null
171         * @since 2.0
172         */
173        protected CharSet(String setStr) {
174            super();
175            add(setStr);
176        }
177    
178        /**
179         * <p>Constructs a new CharSet using the set syntax.
180         * Each string is merged in with the set.</p>
181         *
182         * @param set  Strings to merge into the initial set
183         * @throws NullPointerException if set is <code>null</code>
184         */
185        protected CharSet(String[] set) {
186            super();
187            int sz = set.length;
188            for (int i = 0; i < sz; i++) {
189                add(set[i]);
190            }
191        }
192    
193        //-----------------------------------------------------------------------
194        /**
195         * <p>Add a set definition string to the <code>CharSet</code>.</p>
196         *
197         * @param str  set definition string
198         */
199        protected void add(String str) {
200            if (str == null) {
201                return;
202            }
203    
204            int len = str.length();
205            int pos = 0;
206            while (pos < len) {
207                int remainder = (len - pos);
208                if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
209                    // negated range
210                    set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
211                    pos += 4;
212                } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
213                    // range
214                    set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
215                    pos += 3;
216                } else if (remainder >= 2 && str.charAt(pos) == '^') {
217                    // negated char
218                    set.add(CharRange.isNot(str.charAt(pos + 1)));
219                    pos += 2;
220                } else {
221                    // char
222                    set.add(CharRange.is(str.charAt(pos)));
223                    pos += 1;
224                }
225            }
226        }
227    
228        //-----------------------------------------------------------------------
229        /**
230         * <p>Gets the internal set as an array of CharRange objects.</p>
231         *
232         * @return an array of immutable CharRange objects
233         * @since 2.0
234         */
235        public CharRange[] getCharRanges() {
236            return (CharRange[]) set.toArray(new CharRange[set.size()]);
237        }
238    
239        //-----------------------------------------------------------------------
240        /**
241         * <p>Does the <code>CharSet</code> contain the specified
242         * character <code>ch</code>.</p>
243         *
244         * @param ch  the character to check for
245         * @return <code>true</code> if the set contains the characters
246         */
247        public boolean contains(char ch) {
248            for (Iterator it = set.iterator(); it.hasNext();) {
249                CharRange range = (CharRange) it.next();
250                if (range.contains(ch)) {
251                    return true;
252                }
253            }
254            return false;
255        }
256    
257        // Basics
258        //-----------------------------------------------------------------------
259        /**
260         * <p>Compares two CharSet objects, returning true if they represent
261         * exactly the same set of characters defined in the same way.</p>
262         *
263         * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i>
264         * equal according to this method.</p>
265         *
266         * @param obj  the object to compare to
267         * @return true if equal
268         * @since 2.0
269         */
270        public boolean equals(Object obj) {
271            if (obj == this) {
272                return true;
273            }
274            if (obj instanceof CharSet == false) {
275                return false;
276            }
277            CharSet other = (CharSet) obj;
278            return set.equals(other.set);
279        }
280    
281        /**
282         * <p>Gets a hashCode compatible with the equals method.</p>
283         *
284         * @return a suitable hashCode
285         * @since 2.0
286         */
287        public int hashCode() {
288            return 89 + set.hashCode();
289        }
290    
291        /**
292         * <p>Gets a string representation of the set.</p>
293         *
294         * @return string representation of the set
295         */
296        public String toString() {
297            return set.toString();
298        }
299    
300    }