001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.io.ByteArrayOutputStream;
021import java.io.UnsupportedEncodingException;
022import java.util.BitSet;
023
024import org.apache.commons.codec.BinaryDecoder;
025import org.apache.commons.codec.BinaryEncoder;
026import org.apache.commons.codec.CharEncoding;
027import org.apache.commons.codec.DecoderException;
028import org.apache.commons.codec.EncoderException;
029import org.apache.commons.codec.StringDecoder;
030import org.apache.commons.codec.StringEncoder;
031import org.apache.commons.codec.binary.StringUtils;
032
033/**
034 * Implements the 'www-form-urlencoded' encoding scheme, also misleadingly known as URL encoding.
035 * <p>
036 * This codec is meant to be a replacement for standard Java classes {@link java.net.URLEncoder} and
037 * {@link java.net.URLDecoder} on older Java platforms, as these classes in Java versions below
038 * 1.4 rely on the platform's default charset encoding.
039 * </p>
040 * <p>
041 * This class is thread-safe as of 1.11
042 * </p>
043 *
044 * @see <a href="http://www.w3.org/TR/html4/interact/forms.html#h-17.13.4.1">Chapter 17.13.4 Form content types</a>
045 *           of the <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>
046 *
047 * @since 1.2
048 */
049public class URLCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
050
051    /**
052     * Release 1.5 made this field final.
053     */
054    protected static final byte ESCAPE_CHAR = '%';
055
056    /**
057     * BitSet of www-form-url safe characters.
058     * This is a copy of the internal BitSet which is now used for the conversion.
059     * Changes to this field are ignored.
060     * @deprecated 1.11 Will be removed in 2.0 (CODEC-230)
061     */
062    @Deprecated
063    protected static final BitSet WWW_FORM_URL;
064
065    private static final BitSet WWW_FORM_URL_SAFE = new BitSet(256);
066
067    // Static initializer for www_form_url
068    static {
069        // alpha characters
070        for (int i = 'a'; i <= 'z'; i++) {
071            WWW_FORM_URL_SAFE.set(i);
072        }
073        for (int i = 'A'; i <= 'Z'; i++) {
074            WWW_FORM_URL_SAFE.set(i);
075        }
076        // numeric characters
077        for (int i = '0'; i <= '9'; i++) {
078            WWW_FORM_URL_SAFE.set(i);
079        }
080        // special chars
081        WWW_FORM_URL_SAFE.set('-');
082        WWW_FORM_URL_SAFE.set('_');
083        WWW_FORM_URL_SAFE.set('.');
084        WWW_FORM_URL_SAFE.set('*');
085        // blank to be replaced with +
086        WWW_FORM_URL_SAFE.set(' ');
087
088        // Create a copy in case anyone (ab)uses it
089        WWW_FORM_URL = (BitSet) WWW_FORM_URL_SAFE.clone();
090    }
091
092    /**
093     * Decodes an array of URL safe 7-bit characters into an array of original bytes. Escaped characters are converted
094     * back to their original representation.
095     *
096     * @param bytes
097     *            array of URL safe characters
098     * @return array of original bytes
099     * @throws DecoderException
100     *             Thrown if URL decoding is unsuccessful
101     */
102    public static final byte[] decodeUrl(final byte[] bytes) throws DecoderException {
103        if (bytes == null) {
104            return null;
105        }
106        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
107        for (int i = 0; i < bytes.length; i++) {
108            final int b = bytes[i];
109            if (b == '+') {
110                buffer.write(' ');
111            } else if (b == ESCAPE_CHAR) {
112                try {
113                    final int u = Utils.digit16(bytes[++i]);
114                    final int l = Utils.digit16(bytes[++i]);
115                    buffer.write((char) ((u << 4) + l));
116                } catch (final ArrayIndexOutOfBoundsException e) {
117                    throw new DecoderException("Invalid URL encoding: ", e);
118                }
119            } else {
120                buffer.write(b);
121            }
122        }
123        return buffer.toByteArray();
124    }
125
126    /**
127     * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
128     *
129     * @param urlsafe
130     *            bitset of characters deemed URL safe
131     * @param bytes
132     *            array of bytes to convert to URL safe characters
133     * @return array of bytes containing URL safe characters
134     */
135    public static final byte[] encodeUrl(BitSet urlsafe, final byte[] bytes) {
136        if (bytes == null) {
137            return null;
138        }
139        if (urlsafe == null) {
140            urlsafe = WWW_FORM_URL_SAFE;
141        }
142
143        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
144        for (final byte c : bytes) {
145            int b = c;
146            if (b < 0) {
147                b = 256 + b;
148            }
149            if (urlsafe.get(b)) {
150                if (b == ' ') {
151                    b = '+';
152                }
153                buffer.write(b);
154            } else {
155                buffer.write(ESCAPE_CHAR);
156                final char hex1 = Utils.hexDigit(b >> 4);
157                final char hex2 = Utils.hexDigit(b);
158                buffer.write(hex1);
159                buffer.write(hex2);
160            }
161        }
162        return buffer.toByteArray();
163    }
164
165    /**
166     * The default charset used for string decoding and encoding.
167     *
168     * @deprecated TODO: This field will be changed to a private final Charset in 2.0. (CODEC-126)
169     */
170    @Deprecated
171    protected volatile String charset; // added volatile: see CODEC-232
172
173    /**
174     * Default constructor.
175     */
176    public URLCodec() {
177        this(CharEncoding.UTF_8);
178    }
179
180    /**
181     * Constructor which allows for the selection of a default charset.
182     *
183     * @param charset the default string charset to use.
184     */
185    public URLCodec(final String charset) {
186        this.charset = charset;
187    }
188
189    /**
190     * Decodes an array of URL safe 7-bit characters into an array of original bytes. Escaped characters are converted
191     * back to their original representation.
192     *
193     * @param bytes
194     *            array of URL safe characters
195     * @return array of original bytes
196     * @throws DecoderException
197     *             Thrown if URL decoding is unsuccessful
198     */
199    @Override
200    public byte[] decode(final byte[] bytes) throws DecoderException {
201        return decodeUrl(bytes);
202    }
203
204    /**
205     * Decodes a URL safe object into its original form. Escaped characters are converted back to their original
206     * representation.
207     *
208     * @param obj
209     *            URL safe object to convert into its original form
210     * @return original object
211     * @throws DecoderException
212     *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
213     *             condition is encountered during the decode process.
214     */
215    @Override
216    public Object decode(final Object obj) throws DecoderException {
217        if (obj == null) {
218            return null;
219        }
220        if (obj instanceof byte[]) {
221            return decode((byte[]) obj);
222        }
223        if (obj instanceof String) {
224            return decode((String) obj);
225        }
226        throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be URL decoded");
227    }
228
229    /**
230     * Decodes a URL safe string into its original form using the default string charset. Escaped characters are
231     * converted back to their original representation.
232     *
233     * @param str
234     *            URL safe string to convert into its original form
235     * @return original string
236     * @throws DecoderException
237     *             Thrown if URL decoding is unsuccessful
238     * @see #getDefaultCharset()
239     */
240    @Override
241    public String decode(final String str) throws DecoderException {
242        if (str == null) {
243            return null;
244        }
245        try {
246            return decode(str, getDefaultCharset());
247        } catch (final UnsupportedEncodingException e) {
248            throw new DecoderException(e.getMessage(), e);
249        }
250    }
251
252    /**
253     * Decodes a URL safe string into its original form using the specified encoding. Escaped characters are converted
254     * back to their original representation.
255     *
256     * @param str
257     *            URL safe string to convert into its original form
258     * @param charsetName
259     *            the original string charset
260     * @return original string
261     * @throws DecoderException
262     *             Thrown if URL decoding is unsuccessful
263     * @throws UnsupportedEncodingException
264     *             Thrown if charset is not supported
265     */
266    public String decode(final String str, final String charsetName)
267            throws DecoderException, UnsupportedEncodingException {
268        if (str == null) {
269            return null;
270        }
271        return new String(decode(StringUtils.getBytesUsAscii(str)), charsetName);
272    }
273
274    /**
275     * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
276     *
277     * @param bytes
278     *            array of bytes to convert to URL safe characters
279     * @return array of bytes containing URL safe characters
280     */
281    @Override
282    public byte[] encode(final byte[] bytes) {
283        return encodeUrl(WWW_FORM_URL_SAFE, bytes);
284    }
285
286    /**
287     * Encodes an object into its URL safe form. Unsafe characters are escaped.
288     *
289     * @param obj
290     *            string to convert to a URL safe form
291     * @return URL safe object
292     * @throws EncoderException
293     *             Thrown if URL encoding is not applicable to objects of this type or if encoding is unsuccessful
294     */
295    @Override
296    public Object encode(final Object obj) throws EncoderException {
297        if (obj == null) {
298            return null;
299        }
300        if (obj instanceof byte[]) {
301            return encode((byte[]) obj);
302        }
303        if (obj instanceof String) {
304            return encode((String) obj);
305        }
306        throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be URL encoded");
307    }
308
309    /**
310     * Encodes a string into its URL safe form using the default string charset. Unsafe characters are escaped.
311     *
312     * @param str
313     *            string to convert to a URL safe form
314     * @return URL safe string
315     * @throws EncoderException
316     *             Thrown if URL encoding is unsuccessful
317     *
318     * @see #getDefaultCharset()
319     */
320    @Override
321    public String encode(final String str) throws EncoderException {
322        if (str == null) {
323            return null;
324        }
325        try {
326            return encode(str, getDefaultCharset());
327        } catch (final UnsupportedEncodingException e) {
328            throw new EncoderException(e.getMessage(), e);
329        }
330    }
331
332    /**
333     * Encodes a string into its URL safe form using the specified string charset. Unsafe characters are escaped.
334     *
335     * @param str
336     *            string to convert to a URL safe form
337     * @param charsetName
338     *            the charset for str
339     * @return URL safe string
340     * @throws UnsupportedEncodingException
341     *             Thrown if charset is not supported
342     */
343    public String encode(final String str, final String charsetName) throws UnsupportedEncodingException {
344        if (str == null) {
345            return null;
346        }
347        return StringUtils.newStringUsAscii(encode(str.getBytes(charsetName)));
348    }
349
350    /**
351     * The default charset used for string decoding and encoding.
352     *
353     * @return the default string charset.
354     */
355    public String getDefaultCharset() {
356        return this.charset;
357    }
358
359    /**
360     * The {@code String} encoding used for decoding and encoding.
361     *
362     * @return Returns the encoding.
363     *
364     * @deprecated Use {@link #getDefaultCharset()}, will be removed in 2.0.
365     */
366    @Deprecated
367    public String getEncoding() {
368        return this.charset;
369    }
370
371}