NumericEntityUnescaper.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.lang3.text.translate;

  18. import java.io.IOException;
  19. import java.io.Writer;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.EnumSet;

  23. import org.apache.commons.lang3.CharUtils;

  24. /**
  25.  * Translate XML numeric entities of the form &#[xX]?\d+;? to the specific code point.
  26.  *
  27.  * Note that the semicolon is optional.
  28.  *
  29.  * @since 3.0
  30.  * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
  31.  *             <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/translate/NumericEntityUnescaper.html">
  32.  *             NumericEntityUnescaper</a>.
  33.  */
  34. @Deprecated
  35. public class NumericEntityUnescaper extends CharSequenceTranslator {

  36.     /**
  37.      * Enumerates NumericEntityUnescaper options for unescaping.
  38.      *
  39.      * @deprecated As of 3.18.0, use Apache Commons Text <a href=
  40.      *             "https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/translate/NumericEntityUnescaper.OPTION.html">
  41.      *             NumericEntityUnescaper.OPTION</a>.
  42.      */
  43.     @Deprecated
  44.     public enum OPTION {

  45.         /**
  46.          * Require a semicolon.
  47.          */
  48.         semiColonRequired,

  49.         /**
  50.          * Do not require a semicolon.
  51.          */
  52.         semiColonOptional,

  53.         /**
  54.          * Throw an exception if a semicolon is missing.
  55.          */
  56.         errorIfNoSemiColon
  57.     }

  58.     // TODO?: Create an OptionsSet class to hide some of the conditional logic below
  59.     private final EnumSet<OPTION> options;

  60.     /**
  61.      * Create a UnicodeUnescaper.
  62.      *
  63.      * The constructor takes a list of options, only one type of which is currently
  64.      * available (whether to allow, error or ignore the semicolon on the end of a
  65.      * numeric entity to being missing).
  66.      *
  67.      * For example, to support numeric entities without a ';':
  68.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
  69.      * and to throw an IllegalArgumentException when they're missing:
  70.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
  71.      *
  72.      * Note that the default behavior is to ignore them.
  73.      *
  74.      * @param options to apply to this unescaper
  75.      */
  76.     public NumericEntityUnescaper(final OPTION... options) {
  77.         if (options.length > 0) {
  78.             this.options = EnumSet.copyOf(Arrays.asList(options));
  79.         } else {
  80.             this.options = EnumSet.copyOf(Collections.singletonList(OPTION.semiColonRequired));
  81.         }
  82.     }

  83.     /**
  84.      * Tests whether the passed in option is currently set.
  85.      *
  86.      * @param option to check state of
  87.      * @return whether the option is set
  88.      */
  89.     public boolean isSet(final OPTION option) {
  90.         return options != null && options.contains(option);
  91.     }

  92.     /**
  93.      * {@inheritDoc}
  94.      */
  95.     @Override
  96.     public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
  97.         final int seqEnd = input.length();
  98.         // Uses -2 to ensure there is something after the &#
  99.         if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
  100.             int start = index + 2;
  101.             boolean isHex = false;

  102.             final char firstChar = input.charAt(start);
  103.             if (firstChar == 'x' || firstChar == 'X') {
  104.                 start++;
  105.                 isHex = true;

  106.                 // Check there's more than just an x after the &#
  107.                 if (start == seqEnd) {
  108.                     return 0;
  109.                 }
  110.             }

  111.             int end = start;
  112.             // Note that this supports character codes without a ; on the end
  113.             while (end < seqEnd && CharUtils.isHex(input.charAt(end))) {
  114.                 end++;
  115.             }

  116.             final boolean semiNext = end != seqEnd && input.charAt(end) == ';';

  117.             if (!semiNext) {
  118.                 if (isSet(OPTION.semiColonRequired)) {
  119.                     return 0;
  120.                 }
  121.                 if (isSet(OPTION.errorIfNoSemiColon)) {
  122.                     throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
  123.                 }
  124.             }

  125.             final int entityValue;
  126.             try {
  127.                 if (isHex) {
  128.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
  129.                 } else {
  130.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
  131.                 }
  132.             } catch (final NumberFormatException nfe) {
  133.                 return 0;
  134.             }

  135.             if (entityValue > 0xFFFF) {
  136.                 final char[] chars = Character.toChars(entityValue);
  137.                 out.write(chars[0]);
  138.                 out.write(chars[1]);
  139.             } else {
  140.                 out.write(entityValue);
  141.             }

  142.             return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
  143.         }
  144.         return 0;
  145.     }
  146. }