1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.net;
19
20 import java.nio.ByteBuffer;
21 import java.util.BitSet;
22
23 import org.apache.commons.codec.BinaryDecoder;
24 import org.apache.commons.codec.BinaryEncoder;
25 import org.apache.commons.codec.DecoderException;
26 import org.apache.commons.codec.EncoderException;
27
28 /**
29 * Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
30 * special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
31 * of the URI.
32 * <p>
33 * This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
34 * interface only call the access
35 * </p>
36 *
37 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
38 * @since 1.12
39 */
40 public class PercentCodec implements BinaryEncoder, BinaryDecoder {
41
42 /**
43 * The escape character used by the Percent-Encoding in order to introduce an encoded character.
44 */
45 private static final byte ESCAPE_CHAR = '%';
46
47 /**
48 * The bit set used to store the character that should be always encoded
49 */
50 private final BitSet alwaysEncodeChars = new BitSet();
51
52 /**
53 * The flag defining if the space character should be encoded as '+'
54 */
55 private final boolean plusForSpace;
56
57 /**
58 * The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
59 */
60 private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;
61
62 /**
63 * Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
64 * while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
65 * character for Percent-Encoding.
66 */
67 public PercentCodec() {
68 this.plusForSpace = false;
69 insertAlwaysEncodeChar(ESCAPE_CHAR);
70 }
71
72 /**
73 * Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
74 * always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
75 * is used as escape character for Percent-Encoding.
76 *
77 * @param alwaysEncodeChars the unsafe characters that should always be encoded
78 * @param plusForSpace the flag defining if the space character should be encoded as '+'
79 */
80 public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
81 this.plusForSpace = plusForSpace;
82 insertAlwaysEncodeChars(alwaysEncodeChars);
83 }
84
85 private boolean canEncode(final byte c) {
86 return !isAsciiChar(c) || inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c);
87 }
88
89 private boolean containsSpace(final byte[] bytes) {
90 for (final byte b : bytes) {
91 if (b == ' ') {
92 return true;
93 }
94 }
95 return false;
96 }
97
98 /**
99 * Decodes bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
100 * decode the encoded characters to Unicode.
101 */
102 @Override
103 public byte[] decode(final byte[] bytes) throws DecoderException {
104 if (bytes == null) {
105 return null;
106 }
107 final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
108 for (int i = 0; i < bytes.length; i++) {
109 final byte b = bytes[i];
110 if (b == ESCAPE_CHAR) {
111 try {
112 final int u = Utils.digit16(bytes[++i]);
113 final int l = Utils.digit16(bytes[++i]);
114 buffer.put((byte) ((u << 4) + l));
115 } catch (final ArrayIndexOutOfBoundsException e) {
116 throw new DecoderException("Invalid percent decoding: ", e);
117 }
118 } else if (plusForSpace && b == '+') {
119 buffer.put((byte) ' ');
120 } else {
121 buffer.put(b);
122 }
123 }
124 return buffer.array();
125 }
126
127 /**
128 * Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
129 *
130 * @param obj the object to decode
131 * @return the decoding result byte[] as Object
132 * @throws DecoderException if the object is not a byte array
133 */
134 @Override
135 public Object decode(final Object obj) throws DecoderException {
136 if (obj == null) {
137 return null;
138 }
139 if (obj instanceof byte[]) {
140 return decode((byte[]) obj);
141 }
142 throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
143 }
144
145 private byte[] doEncode(final byte[] bytes, final int expectedLength, final boolean willEncode) {
146 final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
147 for (final byte b : bytes) {
148 if (willEncode && canEncode(b)) {
149 byte bb = b;
150 if (bb < 0) {
151 bb = (byte) (256 + bb);
152 }
153 final char hex1 = Utils.hexChar(bb >> 4);
154 final char hex2 = Utils.hexChar(bb);
155 buffer.put(ESCAPE_CHAR);
156 buffer.put((byte) hex1);
157 buffer.put((byte) hex2);
158 } else if (plusForSpace && b == ' ') {
159 buffer.put((byte) '+');
160 } else {
161 buffer.put(b);
162 }
163 }
164 return buffer.array();
165 }
166
167 /**
168 * Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
169 * US-ASCII characters that are configured to be always encoded.
170 */
171 @Override
172 public byte[] encode(final byte[] bytes) throws EncoderException {
173 if (bytes == null) {
174 return null;
175 }
176 final int expectedEncodingBytes = expectedEncodingBytes(bytes);
177 final boolean willEncode = expectedEncodingBytes != bytes.length;
178 if (willEncode || plusForSpace && containsSpace(bytes)) {
179 return doEncode(bytes, expectedEncodingBytes, willEncode);
180 }
181 return bytes;
182 }
183
184 /**
185 * Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
186 *
187 * @param obj the object to encode
188 * @return the encoding result byte[] as Object
189 * @throws EncoderException if the object is not a byte array
190 */
191 @Override
192 public Object encode(final Object obj) throws EncoderException {
193 if (obj == null) {
194 return null;
195 }
196 if (obj instanceof byte[]) {
197 return encode((byte[]) obj);
198 }
199 throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
200 }
201
202 private int expectedDecodingBytes(final byte[] bytes) {
203 int byteCount = 0;
204 for (int i = 0; i < bytes.length;) {
205 final byte b = bytes[i];
206 i += b == ESCAPE_CHAR ? 3 : 1;
207 byteCount++;
208 }
209 return byteCount;
210 }
211
212 private int expectedEncodingBytes(final byte[] bytes) {
213 int byteCount = 0;
214 for (final byte b : bytes) {
215 byteCount += canEncode(b) ? 3 : 1;
216 }
217 return byteCount;
218 }
219
220 private boolean inAlwaysEncodeCharsRange(final byte c) {
221 return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
222 }
223
224 /**
225 * Inserts a single character into a BitSet and maintains the min and max of the characters of the
226 * {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
227 *
228 * @param b the byte that is candidate for min and max limit
229 */
230 private void insertAlwaysEncodeChar(final byte b) {
231 if (b < 0) {
232 throw new IllegalArgumentException("byte must be >= 0");
233 }
234 this.alwaysEncodeChars.set(b);
235 if (b < alwaysEncodeCharsMin) {
236 alwaysEncodeCharsMin = b;
237 }
238 if (b > alwaysEncodeCharsMax) {
239 alwaysEncodeCharsMax = b;
240 }
241 }
242
243 /**
244 * Inserts the byte array into a BitSet for faster lookup.
245 *
246 * @param alwaysEncodeCharsArray
247 */
248 private void insertAlwaysEncodeChars(final byte[] alwaysEncodeCharsArray) {
249 if (alwaysEncodeCharsArray != null) {
250 for (final byte b : alwaysEncodeCharsArray) {
251 insertAlwaysEncodeChar(b);
252 }
253 }
254 insertAlwaysEncodeChar(ESCAPE_CHAR);
255 }
256
257 private boolean isAsciiChar(final byte c) {
258 return c >= 0;
259 }
260 }