CharSet.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.lang3;

  18. import java.io.Serializable;
  19. import java.util.Collections;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.Map;
  23. import java.util.Set;
  24. import java.util.stream.Stream;

  25. /**
  26.  * A set of characters.
  27.  *
  28.  * <p>Instances are immutable, but instances of subclasses may not be.</p>
  29.  *
  30.  * <p>#ThreadSafe#</p>
  31.  * @since 1.0
  32.  */
  33. public class CharSet implements Serializable {

  34.     /**
  35.      * Required for serialization support. Lang version 2.0.
  36.      *
  37.      * @see java.io.Serializable
  38.      */
  39.     private static final long serialVersionUID = 5947847346149275958L;

  40.     /**
  41.      * A CharSet defining no characters.
  42.      * @since 2.0
  43.      */
  44.     public static final CharSet EMPTY = new CharSet((String) null);

  45.     /**
  46.      * A CharSet defining ASCII alphabetic characters "a-zA-Z".
  47.      * @since 2.0
  48.      */
  49.     public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");

  50.     /**
  51.      * A CharSet defining ASCII alphabetic characters "a-z".
  52.      * @since 2.0
  53.      */
  54.     public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");

  55.     /**
  56.      * A CharSet defining ASCII alphabetic characters "A-Z".
  57.      * @since 2.0
  58.      */
  59.     public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");

  60.     /**
  61.      * A CharSet defining ASCII alphabetic characters "0-9".
  62.      * @since 2.0
  63.      */
  64.     public static final CharSet ASCII_NUMERIC = new CharSet("0-9");

  65.     /**
  66.      * A Map of the common cases used in the factory.
  67.      * Subclasses can add more common patterns if desired
  68.      * @since 2.0
  69.      */
  70.     protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());

  71.     static {
  72.         COMMON.put(null, EMPTY);
  73.         COMMON.put(StringUtils.EMPTY, EMPTY);
  74.         COMMON.put("a-zA-Z", ASCII_ALPHA);
  75.         COMMON.put("A-Za-z", ASCII_ALPHA);
  76.         COMMON.put("a-z", ASCII_ALPHA_LOWER);
  77.         COMMON.put("A-Z", ASCII_ALPHA_UPPER);
  78.         COMMON.put("0-9", ASCII_NUMERIC);
  79.     }

  80.     /**
  81.      * Factory method to create a new CharSet using a special syntax.
  82.      *
  83.      * <ul>
  84.      *  <li>{@code null} or empty string ("")
  85.      * - set containing no characters</li>
  86.      *  <li>Single character, such as "a"
  87.      *  - set containing just that character</li>
  88.      *  <li>Multi character, such as "a-e"
  89.      *  - set containing characters from one character to the other</li>
  90.      *  <li>Negated, such as "^a" or "^a-e"
  91.      *  - set containing all characters except those defined</li>
  92.      *  <li>Combinations, such as "abe-g"
  93.      *  - set containing all the characters from the individual sets</li>
  94.      * </ul>
  95.      *
  96.      * <p>The matching order is:</p>
  97.      * <ol>
  98.      *  <li>Negated multi character range, such as "^a-e"
  99.      *  <li>Ordinary multi character range, such as "a-e"
  100.      *  <li>Negated single character, such as "^a"
  101.      *  <li>Ordinary single character, such as "a"
  102.      * </ol>
  103.      *
  104.      * <p>Matching works left to right. Once a match is found the
  105.      * search starts again from the next character.</p>
  106.      *
  107.      * <p>If the same range is defined twice using the same syntax, only
  108.      * one range will be kept.
  109.      * Thus, "a-ca-c" creates only one range of "a-c".</p>
  110.      *
  111.      * <p>If the start and end of a range are in the wrong order,
  112.      * they are reversed. Thus "a-e" is the same as "e-a".
  113.      * As a result, "a-ee-a" would create only one range,
  114.      * as the "a-e" and "e-a" are the same.</p>
  115.      *
  116.      * <p>The set of characters represented is the union of the specified ranges.</p>
  117.      *
  118.      * <p>There are two ways to add a literal negation character ({@code ^}):</p>
  119.      * <ul>
  120.      *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
  121.      *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
  122.      * </ul>
  123.      *
  124.      * <p>Examples using the negation character:</p>
  125.      * <pre>
  126.      *     CharSet.getInstance("^a-c").contains('a') = false
  127.      *     CharSet.getInstance("^a-c").contains('d') = true
  128.      *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
  129.      *     CharSet.getInstance("^^a-c").contains('^') = false
  130.      *     CharSet.getInstance("^a-cd-f").contains('d') = true
  131.      *     CharSet.getInstance("a-c^").contains('^') = true
  132.      *     CharSet.getInstance("^", "a-c").contains('^') = true
  133.      * </pre>
  134.      *
  135.      * <p>All CharSet objects returned by this method will be immutable.</p>
  136.      *
  137.      * @param setStrs  Strings to merge into the set, may be null
  138.      * @return a CharSet instance
  139.      * @since 2.4
  140.      */
  141.     public static CharSet getInstance(final String... setStrs) {
  142.         if (setStrs == null) {
  143.             return null;
  144.         }
  145.         if (setStrs.length == 1) {
  146.             final CharSet common = COMMON.get(setStrs[0]);
  147.             if (common != null) {
  148.                 return common;
  149.             }
  150.         }
  151.         return new CharSet(setStrs);
  152.     }

  153.     /** The set of CharRange objects. */
  154.     private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());

  155.     /**
  156.      * Constructs a new CharSet using the set syntax.
  157.      * Each string is merged in with the set.
  158.      *
  159.      * @param set  Strings to merge into the initial set
  160.      * @throws NullPointerException if set is {@code null}
  161.      */
  162.     protected CharSet(final String... set) {
  163.         Stream.of(set).forEach(this::add);
  164.     }

  165.     /**
  166.      * Add a set definition string to the {@link CharSet}.
  167.      *
  168.      * @param str  set definition string
  169.      */
  170.     protected void add(final String str) {
  171.         if (str == null) {
  172.             return;
  173.         }

  174.         final int len = str.length();
  175.         int pos = 0;
  176.         while (pos < len) {
  177.             final int remainder = len - pos;
  178.             if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
  179.                 // negated range
  180.                 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
  181.                 pos += 4;
  182.             } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
  183.                 // range
  184.                 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
  185.                 pos += 3;
  186.             } else if (remainder >= 2 && str.charAt(pos) == '^') {
  187.                 // negated char
  188.                 set.add(CharRange.isNot(str.charAt(pos + 1)));
  189.                 pos += 2;
  190.             } else {
  191.                 // char
  192.                 set.add(CharRange.is(str.charAt(pos)));
  193.                 pos += 1;
  194.             }
  195.         }
  196.     }

  197.     /**
  198.      * Does the {@link CharSet} contain the specified
  199.      * character {@code ch}.
  200.      *
  201.      * @param ch  the character to check for
  202.      * @return {@code true} if the set contains the characters
  203.      */
  204.     public boolean contains(final char ch) {
  205.         synchronized (set) {
  206.             return set.stream().anyMatch(range -> range.contains(ch));
  207.         }
  208.     }

  209.     // Basics
  210.     /**
  211.      * Compares two {@link CharSet} objects, returning true if they represent
  212.      * exactly the same set of characters defined in the same way.
  213.      *
  214.      * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
  215.      * equal according to this method.</p>
  216.      *
  217.      * @param obj  the object to compare to
  218.      * @return true if equal
  219.      * @since 2.0
  220.      */
  221.     @Override
  222.     public boolean equals(final Object obj) {
  223.         if (obj == this) {
  224.             return true;
  225.         }
  226.         if (!(obj instanceof CharSet)) {
  227.             return false;
  228.         }
  229.         final CharSet other = (CharSet) obj;
  230.         return set.equals(other.set);
  231.     }

  232.     /**
  233.      * Gets the internal set as an array of CharRange objects.
  234.      *
  235.      * @return an array of immutable CharRange objects
  236.      * @since 2.0
  237.      */
  238. // NOTE: This is no longer public as CharRange is no longer a public class.
  239. //       It may be replaced when CharSet moves to Range.
  240.     /*public*/ CharRange[] getCharRanges() {
  241.         return set.toArray(CharRange.EMPTY_ARRAY);
  242.     }

  243.     /**
  244.      * Gets a hash code compatible with the equals method.
  245.      *
  246.      * @return a suitable hash code
  247.      * @since 2.0
  248.      */
  249.     @Override
  250.     public int hashCode() {
  251.         return 89 + set.hashCode();
  252.     }

  253.     /**
  254.      * Gets a string representation of the set.
  255.      *
  256.      * @return string representation of the set
  257.      */
  258.     @Override
  259.     public String toString() {
  260.         return set.toString();
  261.     }

  262. }