001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.nio.ByteBuffer;
021import java.util.BitSet;
022
023import org.apache.commons.codec.BinaryDecoder;
024import org.apache.commons.codec.BinaryEncoder;
025import org.apache.commons.codec.DecoderException;
026import org.apache.commons.codec.EncoderException;
027
028/**
029 * Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
030 * special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
031 * of the URI.
032 * <p>
033 * This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
034 * interface only call the access
035 * </p>
036 *
037 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
038 * @since 1.12
039 */
040public class PercentCodec implements BinaryEncoder, BinaryDecoder {
041
042    /**
043     * The escape character used by the Percent-Encoding in order to introduce an encoded character.
044     */
045
046    private static final byte ESCAPE_CHAR = '%';
047
048    /**
049     * The bit set used to store the character that should be always encoded
050     */
051    private final BitSet alwaysEncodeChars = new BitSet();
052
053    /**
054     * The flag defining if the space character should be encoded as '+'
055     */
056    private final boolean plusForSpace;
057
058    /**
059     * The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
060     */
061    private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;
062
063    /**
064     * Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
065     * while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
066     * character for Percent-Encoding.
067     */
068    public PercentCodec() {
069        this.plusForSpace = false;
070        insertAlwaysEncodeChar(ESCAPE_CHAR);
071    }
072
073    /**
074     * Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
075     * always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
076     * is used as escape character for Percent-Encoding.
077     *
078     * @param alwaysEncodeChars the unsafe characters that should always be encoded
079     * @param plusForSpace      the flag defining if the space character should be encoded as '+'
080     */
081    public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
082        this.plusForSpace = plusForSpace;
083        insertAlwaysEncodeChars(alwaysEncodeChars);
084    }
085
086    private boolean canEncode(final byte c) {
087        return !isAsciiChar(c) || inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c);
088    }
089
090    private boolean containsSpace(final byte[] bytes) {
091        for (final byte b : bytes) {
092            if (b == ' ') {
093                return true;
094            }
095        }
096        return false;
097    }
098
099    /**
100     * Decode bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
101     * decode the encoded characters to Unicode.
102     */
103    @Override
104    public byte[] decode(final byte[] bytes) throws DecoderException {
105        if (bytes == null) {
106            return null;
107        }
108
109        final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
110        for (int i = 0; i < bytes.length; i++) {
111            final byte b = bytes[i];
112            if (b == ESCAPE_CHAR) {
113                try {
114                    final int u = Utils.digit16(bytes[++i]);
115                    final int l = Utils.digit16(bytes[++i]);
116                    buffer.put((byte) ((u << 4) + l));
117                } catch (final ArrayIndexOutOfBoundsException e) {
118                    throw new DecoderException("Invalid percent decoding: ", e);
119                }
120            } else if (plusForSpace && b == '+') {
121                buffer.put((byte) ' ');
122            } else {
123                buffer.put(b);
124            }
125        }
126        return buffer.array();
127    }
128
129    /**
130     * Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
131     *
132     * @param obj the object to decode
133     * @return the decoding result byte[] as Object
134     * @throws DecoderException if the object is not a byte array
135     */
136    @Override
137    public Object decode(final Object obj) throws DecoderException {
138        if (obj == null) {
139            return null;
140        }
141        if (obj instanceof byte[]) {
142            return decode((byte[]) obj);
143        }
144        throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
145    }
146
147    private byte[] doEncode(final byte[] bytes, final int expectedLength, final boolean willEncode) {
148        final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
149        for (final byte b : bytes) {
150            if (willEncode && canEncode(b)) {
151                byte bb = b;
152                if (bb < 0) {
153                    bb = (byte) (256 + bb);
154                }
155                final char hex1 = Utils.hexDigit(bb >> 4);
156                final char hex2 = Utils.hexDigit(bb);
157                buffer.put(ESCAPE_CHAR);
158                buffer.put((byte) hex1);
159                buffer.put((byte) hex2);
160            } else if (plusForSpace && b == ' ') {
161                buffer.put((byte) '+');
162            } else {
163                buffer.put(b);
164            }
165        }
166        return buffer.array();
167    }
168
169    /**
170     * Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
171     * US-ASCII characters that are configured to be always encoded.
172     */
173    @Override
174    public byte[] encode(final byte[] bytes) throws EncoderException {
175        if (bytes == null) {
176            return null;
177        }
178
179        final int expectedEncodingBytes = expectedEncodingBytes(bytes);
180        final boolean willEncode = expectedEncodingBytes != bytes.length;
181        if (willEncode || plusForSpace && containsSpace(bytes)) {
182            return doEncode(bytes, expectedEncodingBytes, willEncode);
183        }
184        return bytes;
185    }
186
187    /**
188     * Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
189     *
190     * @param obj the object to encode
191     * @return the encoding result byte[] as Object
192     * @throws EncoderException if the object is not a byte array
193     */
194    @Override
195    public Object encode(final Object obj) throws EncoderException {
196        if (obj == null) {
197            return null;
198        }
199        if (obj instanceof byte[]) {
200            return encode((byte[]) obj);
201        }
202        throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
203    }
204
205    private int expectedDecodingBytes(final byte[] bytes) {
206        int byteCount = 0;
207        for (int i = 0; i < bytes.length; ) {
208            final byte b = bytes[i];
209            i += b == ESCAPE_CHAR ? 3: 1;
210            byteCount++;
211        }
212        return byteCount;
213    }
214
215    private int expectedEncodingBytes(final byte[] bytes) {
216        int byteCount = 0;
217        for (final byte b : bytes) {
218            byteCount += canEncode(b) ? 3: 1;
219        }
220        return byteCount;
221    }
222
223    private boolean inAlwaysEncodeCharsRange(final byte c) {
224        return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
225    }
226
227    /**
228     * Inserts a single character into a BitSet and maintains the min and max of the characters of the
229     * {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
230     *
231     * @param b the byte that is candidate for min and max limit
232     */
233    private void insertAlwaysEncodeChar(final byte b) {
234        if (b < 0) {
235            throw new IllegalArgumentException("byte must be >= 0");
236        }
237        this.alwaysEncodeChars.set(b);
238        if (b < alwaysEncodeCharsMin) {
239            alwaysEncodeCharsMin = b;
240        }
241        if (b > alwaysEncodeCharsMax) {
242            alwaysEncodeCharsMax = b;
243        }
244    }
245
246    /**
247     * Adds the byte array into a BitSet for faster lookup
248     *
249     * @param alwaysEncodeCharsArray
250     */
251    private void insertAlwaysEncodeChars(final byte[] alwaysEncodeCharsArray) {
252        if (alwaysEncodeCharsArray != null) {
253            for (final byte b : alwaysEncodeCharsArray) {
254                insertAlwaysEncodeChar(b);
255            }
256        }
257        insertAlwaysEncodeChar(ESCAPE_CHAR);
258    }
259
260    private boolean isAsciiChar(final byte c) {
261        return c >= 0;
262    }
263}