NumericEntityUnescaper.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.lang3.text.translate;

  18. import java.io.IOException;
  19. import java.io.Writer;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.EnumSet;

  23. /**
  24.  * Translate XML numeric entities of the form &#[xX]?\d+;? to
  25.  * the specific code point.
  26.  *
  27.  * Note that the semicolon is optional.
  28.  *
  29.  * @since 3.0
  30.  * @deprecated As of 3.6, use Apache Commons Text
  31.  * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/translate/NumericEntityUnescaper.html">
  32.  * NumericEntityUnescaper</a> instead
  33.  */
  34. @Deprecated
  35. public class NumericEntityUnescaper extends CharSequenceTranslator {

  36.     /** Enumerates NumericEntityUnescaper options for unescaping. */
  37.     public enum OPTION {

  38.         /**
  39.          * Require a semicolon.
  40.          */
  41.         semiColonRequired,

  42.         /**
  43.          * Do not require a semicolon.
  44.          */
  45.         semiColonOptional,

  46.         /**
  47.          * Throw an exception if a semicolon is missing.
  48.          */
  49.         errorIfNoSemiColon
  50.     }

  51.     // TODO?: Create an OptionsSet class to hide some of the conditional logic below
  52.     private final EnumSet<OPTION> options;

  53.     /**
  54.      * Create a UnicodeUnescaper.
  55.      *
  56.      * The constructor takes a list of options, only one type of which is currently
  57.      * available (whether to allow, error or ignore the semicolon on the end of a
  58.      * numeric entity to being missing).
  59.      *
  60.      * For example, to support numeric entities without a ';':
  61.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
  62.      * and to throw an IllegalArgumentException when they're missing:
  63.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
  64.      *
  65.      * Note that the default behavior is to ignore them.
  66.      *
  67.      * @param options to apply to this unescaper
  68.      */
  69.     public NumericEntityUnescaper(final OPTION... options) {
  70.         if (options.length > 0) {
  71.             this.options = EnumSet.copyOf(Arrays.asList(options));
  72.         } else {
  73.             this.options = EnumSet.copyOf(Collections.singletonList(OPTION.semiColonRequired));
  74.         }
  75.     }

  76.     /**
  77.      * Whether the passed in option is currently set.
  78.      *
  79.      * @param option to check state of
  80.      * @return whether the option is set
  81.      */
  82.     public boolean isSet(final OPTION option) {
  83.         return options != null && options.contains(option);
  84.     }

  85.     /**
  86.      * {@inheritDoc}
  87.      */
  88.     @Override
  89.     public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
  90.         final int seqEnd = input.length();
  91.         // Uses -2 to ensure there is something after the &#
  92.         if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
  93.             int start = index + 2;
  94.             boolean isHex = false;

  95.             final char firstChar = input.charAt(start);
  96.             if (firstChar == 'x' || firstChar == 'X') {
  97.                 start++;
  98.                 isHex = true;

  99.                 // Check there's more than just an x after the &#
  100.                 if (start == seqEnd) {
  101.                     return 0;
  102.                 }
  103.             }

  104.             int end = start;
  105.             // Note that this supports character codes without a ; on the end
  106.             while (end < seqEnd && ( input.charAt(end) >= '0' && input.charAt(end) <= '9' ||
  107.                                     input.charAt(end) >= 'a' && input.charAt(end) <= 'f' ||
  108.                                     input.charAt(end) >= 'A' && input.charAt(end) <= 'F' ) ) {
  109.                 end++;
  110.             }

  111.             final boolean semiNext = end != seqEnd && input.charAt(end) == ';';

  112.             if (!semiNext) {
  113.                 if (isSet(OPTION.semiColonRequired)) {
  114.                     return 0;
  115.                 }
  116.                 if (isSet(OPTION.errorIfNoSemiColon)) {
  117.                     throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
  118.                 }
  119.             }

  120.             final int entityValue;
  121.             try {
  122.                 if (isHex) {
  123.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
  124.                 } else {
  125.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
  126.                 }
  127.             } catch (final NumberFormatException nfe) {
  128.                 return 0;
  129.             }

  130.             if (entityValue > 0xFFFF) {
  131.                 final char[] chars = Character.toChars(entityValue);
  132.                 out.write(chars[0]);
  133.                 out.write(chars[1]);
  134.             } else {
  135.                 out.write(entityValue);
  136.             }

  137.             return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
  138.         }
  139.         return 0;
  140.     }
  141. }