NumericEntityUnescaper.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.translate;

  18. import java.io.IOException;
  19. import java.io.Writer;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.EnumSet;

  23. import org.apache.commons.lang3.ArrayUtils;

  24. /**
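  25.  * Translates XML numeric entities of the form &#[0-9]+;? or &#[xX][0-9a-fA-F]+;? to
  26.  * the specific code point.
  27.  *
  28.  * Note that the trailing semicolon is required by default; it is optional only when OPTION.semiColonOptional is set.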
  29.  *
  30.  * @since 1.0
  31.  */
  32. public class NumericEntityUnescaper extends CharSequenceTranslator {

  33.     /** Enumerates NumericEntityUnescaper options for unescaping. */
  34.     public enum OPTION {

  35.         /**
  36.          * Requires a semicolon.
  37.          */
  38.         semiColonRequired,

  39.         /**
  40.          * Does not require a semicolon.
  41.          */
  42.         semiColonOptional,

  43.         /**
  44.          * Throws an exception if a semicolon is missing.
  45.          */
  46.         errorIfNoSemiColon
  47.     }

  48.     /** Default options. */
  49.     private static final EnumSet<OPTION> DEFAULT_OPTIONS = EnumSet
  50.         .copyOf(Collections.singletonList(OPTION.semiColonRequired));

  51.     /** EnumSet of OPTIONS, given from the constructor, read-only. */
  52.     private final EnumSet<OPTION> options;

  53.     /**
  54.      * Creates a UnicodeUnescaper.
  55.      *
  56.      * The constructor takes a list of options, only one type of which is currently
  57.      * available (whether to allow, error or ignore the semicolon on the end of a
  58.      * numeric entity to being missing).
  59.      *
  60.      * For example, to support numeric entities without a ';':
  61.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
  62.      * and to throw an IllegalArgumentException when they're missing:
  63.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
  64.      *
  65.      * Note that the default behavior is to ignore them.
  66.      *
  67.      * @param options to apply to this unescaper
  68.      */
  69.     public NumericEntityUnescaper(final OPTION... options) {
  70.         this.options = ArrayUtils.isEmpty(options) ? DEFAULT_OPTIONS : EnumSet.copyOf(Arrays.asList(options));
  71.     }

  72.     /**
  73.      * Tests whether the passed in option is currently set.
  74.      *
  75.      * @param option to check state of
  76.      * @return whether the option is set
  77.      */
  78.     public boolean isSet(final OPTION option) {
  79.         return options.contains(option);
  80.     }

  81.     /**
  82.      * {@inheritDoc}
  83.      */
  84.     @Override
  85.     public int translate(final CharSequence input, final int index, final Writer writer) throws IOException {
  86.         final int seqEnd = input.length();
  87.         // Uses -2 to ensure there is something after the &#
  88.         if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
  89.             int start = index + 2;
  90.             boolean isHex = false;

  91.             final char firstChar = input.charAt(start);
  92.             if (firstChar == 'x' || firstChar == 'X') {
  93.                 start++;
  94.                 isHex = true;

  95.                 // Check there's more than just an x after the &#
  96.                 if (start == seqEnd) {
  97.                     return 0;
  98.                 }
  99.             }

  100.             int end = start;
  101.             // Note that this supports character codes without a ; on the end
  102.             while (end < seqEnd && (input.charAt(end) >= '0' && input.charAt(end) <= '9'
  103.                                     || input.charAt(end) >= 'a' && input.charAt(end) <= 'f'
  104.                                     || input.charAt(end) >= 'A' && input.charAt(end) <= 'F')) {
  105.                 end++;
  106.             }

  107.             final boolean semiNext = end != seqEnd && input.charAt(end) == ';';

  108.             if (!semiNext) {
  109.                 if (isSet(OPTION.semiColonRequired)) {
  110.                     return 0;
  111.                 }
  112.                 if (isSet(OPTION.errorIfNoSemiColon)) {
  113.                     throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
  114.                 }
  115.             }

  116.             final int entityValue;
  117.             try {
  118.                 if (isHex) {
  119.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
  120.                 } else {
  121.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
  122.                 }
  123.             } catch (final NumberFormatException nfe) {
  124.                 return 0;
  125.             }

  126.             if (entityValue > 0xFFFF) {
  127.                 final char[] chrs = Character.toChars(entityValue);
  128.                 writer.write(chrs[0]);
  129.                 writer.write(chrs[1]);
  130.             } else {
  131.                 writer.write(entityValue);
  132.             }

  133.             return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
  134.         }
  135.         return 0;
  136.     }
  137. }