001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.translate; 018 019import java.io.IOException; 020import java.io.Writer; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.EnumSet; 024 025import org.apache.commons.lang3.ArrayUtils; 026 027/** 028 * Translates XML numeric entities of the form &#[xX]?\d+;? to 029 * the specific code point. 030 * 031 * Note that the semicolon is optional. 032 * 033 * @since 1.0 034 */ 035public class NumericEntityUnescaper extends CharSequenceTranslator { 036 037 /** Enumerates NumericEntityUnescaper options for unescaping. */ 038 public enum OPTION { 039 040 /** 041 * Requires a semicolon. 042 */ 043 semiColonRequired, 044 045 /** 046 * Does not require a semicolon. 047 */ 048 semiColonOptional, 049 050 /** 051 * Throws an exception if a semicolon is missing. 052 */ 053 errorIfNoSemiColon 054 } 055 056 /** Default options. */ 057 private static final EnumSet<OPTION> DEFAULT_OPTIONS = EnumSet 058 .copyOf(Collections.singletonList(OPTION.semiColonRequired)); 059 060 /** EnumSet of OPTIONS, given from the constructor, read-only. */ 061 private final EnumSet<OPTION> options; 062 063 /** 064 * Creates a UnicodeUnescaper. 065 * 066 * The constructor takes a list of options, only one type of which is currently available (whether to allow, error or ignore the semicolon on the end of a 067 * numeric entity to being missing). 068 * <p> 069 * For example, to support numeric entities without a ';': 070 * </p> 071 * 072 * <pre> 073 * new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional) 074 * </pre> 075 * <p> 076 * and to throw an IllegalArgumentException when they're missing: 077 * </p> 078 * 079 * <pre> 080 * new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon) 081 * </pre> 082 * <p> 083 * Note that the default behavior is to ignore them. 084 * </p> 085 * 086 * @param options to apply to this unescaper 087 */ 088 public NumericEntityUnescaper(final OPTION... options) { 089 this.options = ArrayUtils.isEmpty(options) ? DEFAULT_OPTIONS : EnumSet.copyOf(Arrays.asList(options)); 090 } 091 092 /** 093 * Tests whether the passed in option is currently set. 094 * 095 * @param option to check state of. 096 * @return whether the option is set. 097 */ 098 public boolean isSet(final OPTION option) { 099 return options.contains(option); 100 } 101 102 /** 103 * {@inheritDoc} 104 */ 105 @Override 106 public int translate(final CharSequence input, final int index, final Writer writer) throws IOException { 107 final int seqEnd = input.length(); 108 // Uses -2 to ensure there is something after the &# 109 if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') { 110 int start = index + 2; 111 boolean isHex = false; 112 113 final char firstChar = input.charAt(start); 114 if (firstChar == 'x' || firstChar == 'X') { 115 start++; 116 isHex = true; 117 118 // Check there's more than just an x after the &# 119 if (start == seqEnd) { 120 return 0; 121 } 122 } 123 124 int end = start; 125 // Note that this supports character codes without a ; on the end 126 while (end < seqEnd && (input.charAt(end) >= '0' && input.charAt(end) <= '9' 127 || input.charAt(end) >= 'a' && input.charAt(end) <= 'f' 128 || input.charAt(end) >= 'A' && input.charAt(end) <= 'F')) { 129 end++; 130 } 131 132 final boolean semiNext = end != seqEnd && input.charAt(end) == ';'; 133 134 if (!semiNext) { 135 if (isSet(OPTION.semiColonRequired)) { 136 return 0; 137 } 138 if (isSet(OPTION.errorIfNoSemiColon)) { 139 throw new IllegalArgumentException("Semi-colon required at end of numeric entity"); 140 } 141 } 142 143 final int entityValue; 144 try { 145 if (isHex) { 146 entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16); 147 } else { 148 entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10); 149 } 150 } catch (final NumberFormatException nfe) { 151 return 0; 152 } 153 154 if (entityValue > 0xFFFF) { 155 final char[] chrs = Character.toChars(entityValue); 156 writer.write(chrs[0]); 157 writer.write(chrs[1]); 158 } else { 159 writer.write(entityValue); 160 } 161 162 return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0); 163 } 164 return 0; 165 } 166}