View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.net;
19  
20  import java.nio.ByteBuffer;
21  import java.util.BitSet;
22  
23  import org.apache.commons.codec.BinaryDecoder;
24  import org.apache.commons.codec.BinaryEncoder;
25  import org.apache.commons.codec.DecoderException;
26  import org.apache.commons.codec.EncoderException;
27  
28  /**
29   * Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
30   * special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
31   * of the URI.
32   * <p>
33   * This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
34   * interface only call the access
35   * </p>
36   *
37   * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
38   * @since 1.12
39   */
40  public class PercentCodec implements BinaryEncoder, BinaryDecoder {
41  
42      /**
43       * The escape character used by the Percent-Encoding in order to introduce an encoded character.
44       */
45      private static final byte ESCAPE_CHAR = '%';
46  
47      /**
48       * The bit set used to store the character that should be always encoded
49       */
50      private final BitSet alwaysEncodeChars = new BitSet();
51  
52      /**
53       * The flag defining if the space character should be encoded as '+'
54       */
55      private final boolean plusForSpace;
56  
57      /**
58       * The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
59       */
60      private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;
61  
62      /**
63       * Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
64       * while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
65       * character for Percent-Encoding.
66       */
67      public PercentCodec() {
68          this.plusForSpace = false;
69          insertAlwaysEncodeChar(ESCAPE_CHAR);
70      }
71  
72      /**
73       * Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
74       * always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
75       * is used as escape character for Percent-Encoding.
76       *
77       * @param alwaysEncodeChars the unsafe characters that should always be encoded
78       * @param plusForSpace      the flag defining if the space character should be encoded as '+'
79       */
80      public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
81          this.plusForSpace = plusForSpace;
82          insertAlwaysEncodeChars(alwaysEncodeChars);
83      }
84  
85      private boolean canEncode(final byte c) {
86          return !isAsciiChar(c) || inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c);
87      }
88  
89      private boolean containsSpace(final byte[] bytes) {
90          for (final byte b : bytes) {
91              if (b == ' ') {
92                  return true;
93              }
94          }
95          return false;
96      }
97  
98      /**
99       * Decodes bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
100      * decode the encoded characters to Unicode.
101      */
102     @Override
103     public byte[] decode(final byte[] bytes) throws DecoderException {
104         if (bytes == null) {
105             return null;
106         }
107         final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
108         for (int i = 0; i < bytes.length; i++) {
109             final byte b = bytes[i];
110             if (b == ESCAPE_CHAR) {
111                 try {
112                     final int u = Utils.digit16(bytes[++i]);
113                     final int l = Utils.digit16(bytes[++i]);
114                     buffer.put((byte) ((u << 4) + l));
115                 } catch (final ArrayIndexOutOfBoundsException e) {
116                     throw new DecoderException("Invalid percent decoding: ", e);
117                 }
118             } else if (plusForSpace && b == '+') {
119                 buffer.put((byte) ' ');
120             } else {
121                 buffer.put(b);
122             }
123         }
124         return buffer.array();
125     }
126 
127     /**
128      * Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
129      *
130      * @param obj the object to decode
131      * @return the decoding result byte[] as Object
132      * @throws DecoderException if the object is not a byte array
133      */
134     @Override
135     public Object decode(final Object obj) throws DecoderException {
136         if (obj == null) {
137             return null;
138         }
139         if (obj instanceof byte[]) {
140             return decode((byte[]) obj);
141         }
142         throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
143     }
144 
145     private byte[] doEncode(final byte[] bytes, final int expectedLength, final boolean willEncode) {
146         final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
147         for (final byte b : bytes) {
148             if (willEncode && canEncode(b)) {
149                 byte bb = b;
150                 if (bb < 0) {
151                     bb = (byte) (256 + bb);
152                 }
153                 final char hex1 = Utils.hexDigit(bb >> 4);
154                 final char hex2 = Utils.hexDigit(bb);
155                 buffer.put(ESCAPE_CHAR);
156                 buffer.put((byte) hex1);
157                 buffer.put((byte) hex2);
158             } else if (plusForSpace && b == ' ') {
159                 buffer.put((byte) '+');
160             } else {
161                 buffer.put(b);
162             }
163         }
164         return buffer.array();
165     }
166 
167     /**
168      * Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
169      * US-ASCII characters that are configured to be always encoded.
170      */
171     @Override
172     public byte[] encode(final byte[] bytes) throws EncoderException {
173         if (bytes == null) {
174             return null;
175         }
176         final int expectedEncodingBytes = expectedEncodingBytes(bytes);
177         final boolean willEncode = expectedEncodingBytes != bytes.length;
178         if (willEncode || plusForSpace && containsSpace(bytes)) {
179             return doEncode(bytes, expectedEncodingBytes, willEncode);
180         }
181         return bytes;
182     }
183 
184     /**
185      * Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
186      *
187      * @param obj the object to encode
188      * @return the encoding result byte[] as Object
189      * @throws EncoderException if the object is not a byte array
190      */
191     @Override
192     public Object encode(final Object obj) throws EncoderException {
193         if (obj == null) {
194             return null;
195         }
196         if (obj instanceof byte[]) {
197             return encode((byte[]) obj);
198         }
199         throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
200     }
201 
202     private int expectedDecodingBytes(final byte[] bytes) {
203         int byteCount = 0;
204         for (int i = 0; i < bytes.length;) {
205             final byte b = bytes[i];
206             i += b == ESCAPE_CHAR ? 3 : 1;
207             byteCount++;
208         }
209         return byteCount;
210     }
211 
212     private int expectedEncodingBytes(final byte[] bytes) {
213         int byteCount = 0;
214         for (final byte b : bytes) {
215             byteCount += canEncode(b) ? 3 : 1;
216         }
217         return byteCount;
218     }
219 
220     private boolean inAlwaysEncodeCharsRange(final byte c) {
221         return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
222     }
223 
224     /**
225      * Inserts a single character into a BitSet and maintains the min and max of the characters of the
226      * {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
227      *
228      * @param b the byte that is candidate for min and max limit
229      */
230     private void insertAlwaysEncodeChar(final byte b) {
231         if (b < 0) {
232             throw new IllegalArgumentException("byte must be >= 0");
233         }
234         this.alwaysEncodeChars.set(b);
235         if (b < alwaysEncodeCharsMin) {
236             alwaysEncodeCharsMin = b;
237         }
238         if (b > alwaysEncodeCharsMax) {
239             alwaysEncodeCharsMax = b;
240         }
241     }
242 
243     /**
244      * Inserts the byte array into a BitSet for faster lookup.
245      *
246      * @param alwaysEncodeCharsArray
247      */
248     private void insertAlwaysEncodeChars(final byte[] alwaysEncodeCharsArray) {
249         if (alwaysEncodeCharsArray != null) {
250             for (final byte b : alwaysEncodeCharsArray) {
251                 insertAlwaysEncodeChar(b);
252             }
253         }
254         insertAlwaysEncodeChar(ESCAPE_CHAR);
255     }
256 
257     private boolean isAsciiChar(final byte c) {
258         return c >= 0;
259     }
260 }