001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang;
018    
019    import java.io.Serializable;
020    import java.util.Collections;
021    import java.util.HashMap;
022    import java.util.HashSet;
023    import java.util.Iterator;
024    import java.util.Map;
025    import java.util.Set;
026    
027    /**
028     * <p>A set of characters.</p>
029     *
030     * <p>Instances are immutable, but instances of subclasses may not be.</p>
031     *
032     * @author Apache Software Foundation
033     * @author Phil Steitz
034     * @author Pete Gieser
035     * @author Gary Gregory
036     * @since 1.0
037     * @version $Id: CharSet.java 905671 2010-02-02 15:25:14Z niallp $
038     */
039    public class CharSet implements Serializable {
040    
041        /**
042         * Required for serialization support. Lang version 2.0. 
043         * 
044         * @see java.io.Serializable
045         */
046        private static final long serialVersionUID = 5947847346149275958L;
047    
048        /** 
049         * A CharSet defining no characters. 
050         * @since 2.0
051         */
052        public static final CharSet EMPTY = new CharSet((String) null);
053    
054        /** 
055         * A CharSet defining ASCII alphabetic characters "a-zA-Z".
056         * @since 2.0
057         */
058        public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
059    
060        /** 
061         * A CharSet defining ASCII alphabetic characters "a-z".
062         * @since 2.0
063         */
064        public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
065    
066        /** 
067         * A CharSet defining ASCII alphabetic characters "A-Z".
068         * @since 2.0
069         */
070        public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
071    
072        /** 
073         * A CharSet defining ASCII alphabetic characters "0-9".
074         * @since 2.0
075         */
076        public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
077    
078        /**
079         * A Map of the common cases used in the factory.
080         * Subclasses can add more common patterns if desired
081         * @since 2.0
082         */
083        protected static final Map COMMON = Collections.synchronizedMap(new HashMap());
084        
085        static {
086            COMMON.put(null, EMPTY);
087            COMMON.put("", EMPTY);
088            COMMON.put("a-zA-Z", ASCII_ALPHA);
089            COMMON.put("A-Za-z", ASCII_ALPHA);
090            COMMON.put("a-z", ASCII_ALPHA_LOWER);
091            COMMON.put("A-Z", ASCII_ALPHA_UPPER);
092            COMMON.put("0-9", ASCII_NUMERIC);
093        }
094    
095        /** The set of CharRange objects. */
096        private final Set set = new HashSet();
097    
098        //-----------------------------------------------------------------------
099        /**
100         * <p>Factory method to create a new CharSet using a special syntax.</p>
101         *
102         * <ul>
103         *  <li><code>null</code> or empty string ("")
104         * - set containing no characters</li>
105         *  <li>Single character, such as "a"
106         *  - set containing just that character</li>
107         *  <li>Multi character, such as "a-e"
108         *  - set containing characters from one character to the other</li>
109         *  <li>Negated, such as "^a" or "^a-e"
110         *  - set containing all characters except those defined</li>
111         *  <li>Combinations, such as "abe-g"
112         *  - set containing all the characters from the individual sets</li>
113         * </ul>
114         *
115         * <p>The matching order is:</p>
116         * <ol>
117         *  <li>Negated multi character range, such as "^a-e"
118         *  <li>Ordinary multi character range, such as "a-e"
119         *  <li>Negated single character, such as "^a"
120         *  <li>Ordinary single character, such as "a"
121         * </ol>
122         * <p>Matching works left to right. Once a match is found the
123         * search starts again from the next character.</p>
124         *
125         * <p>If the same range is defined twice using the same syntax, only
126         * one range will be kept.
127         * Thus, "a-ca-c" creates only one range of "a-c".</p>
128         *
129         * <p>If the start and end of a range are in the wrong order,
130         * they are reversed. Thus "a-e" is the same as "e-a".
131         * As a result, "a-ee-a" would create only one range,
132         * as the "a-e" and "e-a" are the same.</p>
133         *
134         * <p>The set of characters represented is the union of the specified ranges.</p>
135         *
136         * <p>All CharSet objects returned by this method will be immutable.</p>
137         *
138         * @param setStr  the String describing the set, may be null
139         * @return a CharSet instance
140         * @since 2.0
141         */
142        public static CharSet getInstance(String setStr) {
143            Object set = COMMON.get(setStr);
144            if (set != null) {
145                return (CharSet) set;
146            }
147            return new CharSet(setStr);
148        }
149    
150        /**
151         * <p>Constructs a new CharSet using the set syntax.
152         * Each string is merged in with the set.</p>
153         *
154         * @param setStrs  Strings to merge into the initial set, may be null
155         * @return a CharSet instance
156         * @since 2.4
157         */
158        public static CharSet getInstance(String[] setStrs) {
159            if (setStrs == null) {
160                return null;
161            }
162            return new CharSet(setStrs); 
163        }
164    
165        //-----------------------------------------------------------------------
166        /**
167         * <p>Constructs a new CharSet using the set syntax.</p>
168         *
169         * @param setStr  the String describing the set, may be null
170         * @since 2.0
171         */
172        protected CharSet(String setStr) {
173            super();
174            add(setStr);
175        }
176    
177        /**
178         * <p>Constructs a new CharSet using the set syntax.
179         * Each string is merged in with the set.</p>
180         *
181         * @param set  Strings to merge into the initial set
182         * @throws NullPointerException if set is <code>null</code>
183         */
184        protected CharSet(String[] set) {
185            super();
186            int sz = set.length;
187            for (int i = 0; i < sz; i++) {
188                add(set[i]);
189            }
190        }
191    
192        //-----------------------------------------------------------------------
193        /**
194         * <p>Add a set definition string to the <code>CharSet</code>.</p>
195         *
196         * @param str  set definition string
197         */
198        protected void add(String str) {
199            if (str == null) {
200                return;
201            }
202    
203            int len = str.length();
204            int pos = 0;
205            while (pos < len) {
206                int remainder = (len - pos);
207                if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
208                    // negated range
209                    set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
210                    pos += 4;
211                } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
212                    // range
213                    set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
214                    pos += 3;
215                } else if (remainder >= 2 && str.charAt(pos) == '^') {
216                    // negated char
217                    set.add(CharRange.isNot(str.charAt(pos + 1)));
218                    pos += 2;
219                } else {
220                    // char
221                    set.add(CharRange.is(str.charAt(pos)));
222                    pos += 1;
223                }
224            }
225        }
226    
227        //-----------------------------------------------------------------------
228        /**
229         * <p>Gets the internal set as an array of CharRange objects.</p>
230         *
231         * @return an array of immutable CharRange objects
232         * @since 2.0
233         */
234        public CharRange[] getCharRanges() {
235            return (CharRange[]) set.toArray(new CharRange[set.size()]);
236        }
237    
238        //-----------------------------------------------------------------------
239        /**
240         * <p>Does the <code>CharSet</code> contain the specified
241         * character <code>ch</code>.</p>
242         *
243         * @param ch  the character to check for
244         * @return <code>true</code> if the set contains the characters
245         */
246        public boolean contains(char ch) {
247            for (Iterator it = set.iterator(); it.hasNext();) {
248                CharRange range = (CharRange) it.next();
249                if (range.contains(ch)) {
250                    return true;
251                }
252            }
253            return false;
254        }
255    
256        // Basics
257        //-----------------------------------------------------------------------
258        /**
259         * <p>Compares two CharSet objects, returning true if they represent
260         * exactly the same set of characters defined in the same way.</p>
261         *
262         * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i>
263         * equal according to this method.</p>
264         *
265         * @param obj  the object to compare to
266         * @return true if equal
267         * @since 2.0
268         */
269        public boolean equals(Object obj) {
270            if (obj == this) {
271                return true;
272            }
273            if (obj instanceof CharSet == false) {
274                return false;
275            }
276            CharSet other = (CharSet) obj;
277            return set.equals(other.set);
278        }
279    
280        /**
281         * <p>Gets a hashCode compatible with the equals method.</p>
282         *
283         * @return a suitable hashCode
284         * @since 2.0
285         */
286        public int hashCode() {
287            return 89 + set.hashCode();
288        }
289    
290        /**
291         * <p>Gets a string representation of the set.</p>
292         *
293         * @return string representation of the set
294         */
295        public String toString() {
296            return set.toString();
297        }
298    
299    }