NumericEntityUnescaper.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.translate;

  18. import java.io.IOException;
  19. import java.io.Writer;
  20. import java.util.Arrays;
  21. import java.util.EnumSet;

  22. /**
  23.  * Translate XML numeric entities of the form &#[xX]?\d+;? to
  24.  * the specific codepoint.
  25.  *
  26.  * Note that the semi-colon is optional.
  27.  *
  28.  * @since 1.0
  29.  */
  30. public class NumericEntityUnescaper extends CharSequenceTranslator {

  31.     public static enum OPTION { semiColonRequired, semiColonOptional, errorIfNoSemiColon }

  32.     // TODO?: Create an OptionsSet class to hide some of the conditional logic below
  33.     private final EnumSet<OPTION> options;

  34.     /**
  35.      * Create a UnicodeUnescaper.
  36.      *
  37.      * The constructor takes a list of options, only one type of which is currently
  38.      * available (whether to allow, error or ignore the semi-colon on the end of a
  39.      * numeric entity to being missing).
  40.      *
  41.      * For example, to support numeric entities without a ';':
  42.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
  43.      * and to throw an IllegalArgumentException when they're missing:
  44.      *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
  45.      *
  46.      * Note that the default behaviour is to ignore them.
  47.      *
  48.      * @param options to apply to this unescaper
  49.      */
  50.     public NumericEntityUnescaper(final OPTION... options) {
  51.         if(options.length > 0) {
  52.             this.options = EnumSet.copyOf(Arrays.asList(options));
  53.         } else {
  54.             this.options = EnumSet.copyOf(Arrays.asList(new OPTION[] { OPTION.semiColonRequired }));
  55.         }
  56.     }

  57.     /**
  58.      * Whether the passed in option is currently set.
  59.      *
  60.      * @param option to check state of
  61.      * @return whether the option is set
  62.      */
  63.     public boolean isSet(final OPTION option) {
  64.         return options == null ? false : options.contains(option);
  65.     }

  66.     /**
  67.      * {@inheritDoc}
  68.      */
  69.     @Override
  70.     public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
  71.         final int seqEnd = input.length();
  72.         // Uses -2 to ensure there is something after the &#
  73.         if(input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
  74.             int start = index + 2;
  75.             boolean isHex = false;

  76.             final char firstChar = input.charAt(start);
  77.             if(firstChar == 'x' || firstChar == 'X') {
  78.                 start++;
  79.                 isHex = true;

  80.                 // Check there's more than just an x after the &#
  81.                 if(start == seqEnd) {
  82.                     return 0;
  83.                 }
  84.             }

  85.             int end = start;
  86.             // Note that this supports character codes without a ; on the end
  87.             while(end < seqEnd && ( input.charAt(end) >= '0' && input.charAt(end) <= '9' ||
  88.                                     input.charAt(end) >= 'a' && input.charAt(end) <= 'f' ||
  89.                                     input.charAt(end) >= 'A' && input.charAt(end) <= 'F' ) )
  90.             {
  91.                 end++;
  92.             }

  93.             final boolean semiNext = end != seqEnd && input.charAt(end) == ';';

  94.             if(!semiNext) {
  95.                 if(isSet(OPTION.semiColonRequired)) {
  96.                     return 0;
  97.                 } else
  98.                 if(isSet(OPTION.errorIfNoSemiColon)) {
  99.                     throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
  100.                 }
  101.             }

  102.             int entityValue;
  103.             try {
  104.                 if(isHex) {
  105.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
  106.                 } else {
  107.                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
  108.                 }
  109.             } catch(final NumberFormatException nfe) {
  110.                 return 0;
  111.             }

  112.             if(entityValue > 0xFFFF) {
  113.                 final char[] chrs = Character.toChars(entityValue);
  114.                 out.write(chrs[0]);
  115.                 out.write(chrs[1]);
  116.             } else {
  117.                 out.write(entityValue);
  118.             }

  119.             return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
  120.         }
  121.         return 0;
  122.     }
  123. }