001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.nio.ByteBuffer;
021import java.util.BitSet;
022import org.apache.commons.codec.BinaryDecoder;
023import org.apache.commons.codec.BinaryEncoder;
024import org.apache.commons.codec.DecoderException;
025import org.apache.commons.codec.EncoderException;
026
027/**
028 * Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
029 * special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
030 * of the URI.
031 * <p>
032 * This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
033 * interface only call the access
034 * </p>
035 *
036 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
037 * @since 1.12
038 */
039public class PercentCodec implements BinaryEncoder, BinaryDecoder {
040
041    /**
042     * The escape character used by the Percent-Encoding in order to introduce an encoded character.
043     */
044
045    private static final byte ESCAPE_CHAR = '%';
046
047    /**
048     * The bit set used to store the character that should be always encoded
049     */
050    private final BitSet alwaysEncodeChars = new BitSet();
051
052    /**
053     * The flag defining if the space character should be encoded as '+'
054     */
055    private final boolean plusForSpace;
056
057    /**
058     * The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
059     */
060    private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;
061
062    /**
063     * Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
064     * while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
065     * character for Percent-Encoding.
066     */
067    public PercentCodec() {
068        this.plusForSpace = false;
069        insertAlwaysEncodeChar(ESCAPE_CHAR);
070    }
071
072    /**
073     * Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
074     * always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
075     * is used as escape character for Percent-Encoding.
076     *
077     * @param alwaysEncodeChars the unsafe characters that should always be encoded
078     * @param plusForSpace      the flag defining if the space character should be encoded as '+'
079     */
080    public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
081        this.plusForSpace = plusForSpace;
082        insertAlwaysEncodeChars(alwaysEncodeChars);
083    }
084
085    /**
086     * Adds the byte array into a BitSet for faster lookup
087     *
088     * @param alwaysEncodeCharsArray
089     */
090    private void insertAlwaysEncodeChars(final byte[] alwaysEncodeCharsArray) {
091        if (alwaysEncodeCharsArray != null) {
092            for (final byte b : alwaysEncodeCharsArray) {
093                insertAlwaysEncodeChar(b);
094            }
095        }
096        insertAlwaysEncodeChar(ESCAPE_CHAR);
097    }
098
099    /**
100     * Inserts a single character into a BitSet and maintains the min and max of the characters of the
101     * {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
102     *
103     * @param b the byte that is candidate for min and max limit
104     */
105    private void insertAlwaysEncodeChar(final byte b) {
106        this.alwaysEncodeChars.set(b);
107        if (b < alwaysEncodeCharsMin) {
108            alwaysEncodeCharsMin = b;
109        }
110        if (b > alwaysEncodeCharsMax) {
111            alwaysEncodeCharsMax = b;
112        }
113    }
114
115    /**
116     * Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
117     * US-ASCII characters that are configured to be always encoded.
118     */
119    @Override
120    public byte[] encode(final byte[] bytes) throws EncoderException {
121        if (bytes == null) {
122            return null;
123        }
124
125        final int expectedEncodingBytes = expectedEncodingBytes(bytes);
126        final boolean willEncode = expectedEncodingBytes != bytes.length;
127        if (willEncode || (plusForSpace && containsSpace(bytes))) {
128            return doEncode(bytes, expectedEncodingBytes, willEncode);
129        }
130        return bytes;
131    }
132
133    private byte[] doEncode(final byte[] bytes, final int expectedLength, final boolean willEncode) {
134        final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
135        for (final byte b : bytes) {
136            if (willEncode && canEncode(b)) {
137                byte bb = b;
138                if (bb < 0) {
139                    bb = (byte) (256 + bb);
140                }
141                final char hex1 = Utils.hexDigit(bb >> 4);
142                final char hex2 = Utils.hexDigit(bb);
143                buffer.put(ESCAPE_CHAR);
144                buffer.put((byte) hex1);
145                buffer.put((byte) hex2);
146            } else {
147                if (plusForSpace && b == ' ') {
148                    buffer.put((byte) '+');
149                } else {
150                    buffer.put(b);
151                }
152            }
153        }
154        return buffer.array();
155    }
156
157    private int expectedEncodingBytes(final byte[] bytes) {
158        int byteCount = 0;
159        for (final byte b : bytes) {
160            byteCount += canEncode(b) ? 3: 1;
161        }
162        return byteCount;
163    }
164
165    private boolean containsSpace(final byte[] bytes) {
166        for (final byte b : bytes) {
167            if (b == ' ') {
168                return true;
169            }
170        }
171        return false;
172    }
173
174    private boolean canEncode(final byte c) {
175        return !isAsciiChar(c) || (inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c));
176    }
177
178    private boolean inAlwaysEncodeCharsRange(final byte c) {
179        return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
180    }
181
182    private boolean isAsciiChar(final byte c) {
183        return c >= 0;
184    }
185
186    /**
187     * Decode bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
188     * decode the encoded characters to Unicode.
189     */
190    @Override
191    public byte[] decode(final byte[] bytes) throws DecoderException {
192        if (bytes == null) {
193            return null;
194        }
195
196        final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
197        for (int i = 0; i < bytes.length; i++) {
198            final byte b = bytes[i];
199            if (b == ESCAPE_CHAR) {
200                try {
201                    final int u = Utils.digit16(bytes[++i]);
202                    final int l = Utils.digit16(bytes[++i]);
203                    buffer.put((byte) ((u << 4) + l));
204                } catch (final ArrayIndexOutOfBoundsException e) {
205                    throw new DecoderException("Invalid percent decoding: ", e);
206                }
207            } else {
208                if (plusForSpace && b == '+') {
209                    buffer.put((byte) ' ');
210                } else {
211                    buffer.put(b);
212                }
213            }
214        }
215        return buffer.array();
216    }
217
218    private int expectedDecodingBytes(final byte[] bytes) {
219        int byteCount = 0;
220        for (int i = 0; i < bytes.length; ) {
221            final byte b = bytes[i];
222            i += b == ESCAPE_CHAR ? 3: 1;
223            byteCount++;
224        }
225        return byteCount;
226    }
227
228    /**
229     * Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
230     *
231     * @param obj the object to encode
232     * @return the encoding result byte[] as Object
233     * @throws EncoderException if the object is not a byte array
234     */
235    @Override
236    public Object encode(final Object obj) throws EncoderException {
237        if (obj == null) {
238            return null;
239        } else if (obj instanceof byte[]) {
240            return encode((byte[]) obj);
241        } else {
242            throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
243        }
244    }
245
246    /**
247     * Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
248     *
249     * @param obj the object to decode
250     * @return the decoding result byte[] as Object
251     * @throws DecoderException if the object is not a byte array
252     */
253    @Override
254    public Object decode(final Object obj) throws DecoderException {
255        if (obj == null) {
256            return null;
257        } else if (obj instanceof byte[]) {
258            return decode((byte[]) obj);
259        } else {
260            throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
261        }
262    }
263}