001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text.translate;
018
019import java.io.IOException;
020import java.io.Writer;
021import java.util.Arrays;
022import java.util.EnumSet;
023
024/**
025 * Translate XML numeric entities of the form &#[xX]?\d+;? to
026 * the specific codepoint.
027 *
028 * Note that the semi-colon is optional.
029 *
030 * @since 3.0
031 * @deprecated as of 3.6, use commons-text
032 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/translate/NumericEntityUnescaper.html">
033 * NumericEntityUnescaper</a> instead
034 */
035@Deprecated
036public class NumericEntityUnescaper extends CharSequenceTranslator {
037
038    public enum OPTION {
039        semiColonRequired, semiColonOptional, errorIfNoSemiColon
040    }
041
042    // TODO?: Create an OptionsSet class to hide some of the conditional logic below
043    private final EnumSet<OPTION> options;
044
045    /**
046     * Create a UnicodeUnescaper.
047     *
048     * The constructor takes a list of options, only one type of which is currently
049     * available (whether to allow, error or ignore the semi-colon on the end of a
050     * numeric entity to being missing).
051     *
052     * For example, to support numeric entities without a ';':
053     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
054     * and to throw an IllegalArgumentException when they're missing:
055     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
056     *
057     * Note that the default behaviour is to ignore them.
058     *
059     * @param options to apply to this unescaper
060     */
061    public NumericEntityUnescaper(final OPTION... options) {
062        if (options.length > 0) {
063            this.options = EnumSet.copyOf(Arrays.asList(options));
064        } else {
065            this.options = EnumSet.copyOf(Arrays.asList(OPTION.semiColonRequired));
066        }
067    }
068
069    /**
070     * Whether the passed in option is currently set.
071     *
072     * @param option to check state of
073     * @return whether the option is set
074     */
075    public boolean isSet(final OPTION option) {
076        return options != null && options.contains(option);
077    }
078
079    /**
080     * {@inheritDoc}
081     */
082    @Override
083    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
084        final int seqEnd = input.length();
085        // Uses -2 to ensure there is something after the &#
086        if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
087            int start = index + 2;
088            boolean isHex = false;
089
090            final char firstChar = input.charAt(start);
091            if (firstChar == 'x' || firstChar == 'X') {
092                start++;
093                isHex = true;
094
095                // Check there's more than just an x after the &#
096                if (start == seqEnd) {
097                    return 0;
098                }
099            }
100
101            int end = start;
102            // Note that this supports character codes without a ; on the end
103            while (end < seqEnd && ( input.charAt(end) >= '0' && input.charAt(end) <= '9' ||
104                                    input.charAt(end) >= 'a' && input.charAt(end) <= 'f' ||
105                                    input.charAt(end) >= 'A' && input.charAt(end) <= 'F' ) ) {
106                end++;
107            }
108
109            final boolean semiNext = end != seqEnd && input.charAt(end) == ';';
110
111            if (!semiNext) {
112                if (isSet(OPTION.semiColonRequired)) {
113                    return 0;
114                } else
115                if (isSet(OPTION.errorIfNoSemiColon)) {
116                    throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
117                }
118            }
119
120            int entityValue;
121            try {
122                if (isHex) {
123                    entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
124                } else {
125                    entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
126                }
127            } catch(final NumberFormatException nfe) {
128                return 0;
129            }
130
131            if (entityValue > 0xFFFF) {
132                final char[] chars = Character.toChars(entityValue);
133                out.write(chars[0]);
134                out.write(chars[1]);
135            } else {
136                out.write(entityValue);
137            }
138
139            return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
140        }
141        return 0;
142    }
143}