001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.translate;
018
019import java.io.IOException;
020import java.io.Writer;
021import java.util.Arrays;
022import java.util.EnumSet;
023
024/**
025 * Translate XML numeric entities of the form &#[xX]?\d+;? to
026 * the specific codepoint.
027 *
028 * Note that the semi-colon is optional.
029 *
030 * @since 1.0
031 */
032public class NumericEntityUnescaper extends CharSequenceTranslator {
033
034    /** NumericEntityUnescaper option enum. */
035    public enum OPTION {
036
037        /**
038         * Require a semicolon.
039         */
040        semiColonRequired,
041
042        /**
043         * Do not require a semicolon.
044         */
045        semiColonOptional,
046
047        /**
048         * Throw an exception if a semi-colon is missing.
049         */
050        errorIfNoSemiColon
051    }
052
053    /** EnumSet of OPTIONS, given from the constructor. */
054    private final EnumSet<OPTION> options;
055
056    /**
057     * Create a UnicodeUnescaper.
058     *
059     * The constructor takes a list of options, only one type of which is currently
060     * available (whether to allow, error or ignore the semi-colon on the end of a
061     * numeric entity to being missing).
062     *
063     * For example, to support numeric entities without a ';':
064     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
065     * and to throw an IllegalArgumentException when they're missing:
066     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
067     *
068     * Note that the default behavior is to ignore them.
069     *
070     * @param options to apply to this unescaper
071     */
072    public NumericEntityUnescaper(final OPTION... options) {
073        if (options.length > 0) {
074            this.options = EnumSet.copyOf(Arrays.asList(options));
075        } else {
076            this.options = EnumSet.copyOf(Arrays.asList(OPTION.semiColonRequired));
077        }
078    }
079
080    /**
081     * Whether the passed in option is currently set.
082     *
083     * @param option to check state of
084     * @return whether the option is set
085     */
086    public boolean isSet(final OPTION option) {
087        return options != null && options.contains(option);
088    }
089
090    /**
091     * {@inheritDoc}
092     */
093    @Override
094    public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
095        final int seqEnd = input.length();
096        // Uses -2 to ensure there is something after the &#
097        if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
098            int start = index + 2;
099            boolean isHex = false;
100
101            final char firstChar = input.charAt(start);
102            if (firstChar == 'x' || firstChar == 'X') {
103                start++;
104                isHex = true;
105
106                // Check there's more than just an x after the &#
107                if (start == seqEnd) {
108                    return 0;
109                }
110            }
111
112            int end = start;
113            // Note that this supports character codes without a ; on the end
114            while (end < seqEnd && (input.charAt(end) >= '0' && input.charAt(end) <= '9'
115                                    || input.charAt(end) >= 'a' && input.charAt(end) <= 'f'
116                                    || input.charAt(end) >= 'A' && input.charAt(end) <= 'F')) {
117                end++;
118            }
119
120            final boolean semiNext = end != seqEnd && input.charAt(end) == ';';
121
122            if (!semiNext) {
123                if (isSet(OPTION.semiColonRequired)) {
124                    return 0;
125                }
126                if (isSet(OPTION.errorIfNoSemiColon)) {
127                    throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
128                }
129            }
130
131            int entityValue;
132            try {
133                if (isHex) {
134                    entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
135                } else {
136                    entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
137                }
138            } catch (final NumberFormatException nfe) {
139                return 0;
140            }
141
142            if (entityValue > 0xFFFF) {
143                final char[] chrs = Character.toChars(entityValue);
144                out.write(chrs[0]);
145                out.write(chrs[1]);
146            } else {
147                out.write(entityValue);
148            }
149
150            return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
151        }
152        return 0;
153    }
154}