001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.vfs2.util;
018
019import java.io.ByteArrayOutputStream;
020import java.io.UnsupportedEncodingException;
021import java.nio.charset.Charset;
022import java.nio.charset.StandardCharsets;
023
024import org.apache.commons.lang3.StringUtils;
025import org.apache.commons.lang3.util.FluentBitSet;
026import org.apache.commons.logging.Log;
027import org.apache.commons.logging.LogFactory;
028import org.apache.commons.vfs2.provider.GenericURLFileName;
029
030/**
031 * The URI escape and character encoding and decoding utility.
032 * <p>
033 * This was forked from some needed methods such as {@code #encodePath(...)} in {@code org.apache.commons.httpclient.util.URIUtil},
034 * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
035 * but it should work with any different HTTP backend provider implementations.
036 * </p>
037 */
038public class URIUtils {
039
040    /**
041     * Internal character encoding utilities.
042     * <p>
043     * This was forked from some needed methods such as {@code #getBytes(...)} and {@code #getAsciiString(...)}
044     * in {@code org.apache.commons.httpclient.util.EncodingUtil},
045     * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
046     * but it should work with any different HTTP backend provider implementations.
047     * </p>
048     */
049    private static final class EncodingUtils {
050
051        /**
052         * Converts the byte array of ASCII characters to a string. This method is
053         * to be used when decoding content of HTTP elements (such as response
054         * headers)
055         *
056         * @param data the byte array to be encoded
057         * @param offset the index of the first byte to encode
058         * @param length the number of bytes to encode
059         * @return The string representation of the byte array
060         */
061        static String getAsciiString(final byte[] data, final int offset, final int length) {
062            return new String(data, offset, length, StandardCharsets.US_ASCII);
063        }
064
065        /**
066         * Converts the specified string to a byte array.  If the charset is not supported the
067         * default system charset is used.
068         *
069         * @param data the string to be encoded
070         * @param charsetName the desired character encoding
071         * @return The resulting byte array.
072         */
073        static byte[] getBytes(final String data, final String charsetName) {
074            if (data == null) {
075                throw new IllegalArgumentException("data may not be null");
076            }
077
078            if (StringUtils.isEmpty(charsetName)) {
079                throw new IllegalArgumentException("charset may not be null or empty");
080            }
081
082            try {
083                return data.getBytes(charsetName);
084            } catch (final UnsupportedEncodingException e) {
085
086                if (LOG.isWarnEnabled()) {
087                    LOG.warn("Unsupported encoding: " + charsetName + ". System encoding used.");
088                }
089
090                return data.getBytes(Charset.defaultCharset());
091            }
092        }
093
094        private EncodingUtils() {
095        }
096    }
097
098    /**
099     * Internal URL codec utilities.
100     * <p>
101     * This was forked from some needed methods such as {@code #encodeUrl(...)} and {@code #hexDigit(int)}
102     * in {@code org.apache.commons.codec.net.URLCodec}, as commons-codec library cannot be pulled in transitively
103     * via HTTP Client v3 library any more.
104     * </p>
105     */
106    private static final class URLCodecUtils {
107
108        private static final byte ESCAPE_CHAR = '%';
109
110        private static final int EIGHT_BIT_CHARSET_SIZE = 256;
111
112        private static final int FOUR_BITS = 4;
113
114        private static final int UNSIGNED_BYTE_MASK = 0xF;
115
116        // @formatter:off
117        private static final FluentBitSet WWW_FORM_URL_SAFE = URIBitSets.bitSet()
118            // alpha characters
119            .setInclusive('a', 'z')
120            .setInclusive('A', 'Z')
121            // numeric characters
122            .setInclusive('0', '9')
123            // special chars
124            .set('-', '_', '.', '*')
125            // blank to be replaced with +
126            .set(' ');
127        // @formatter:on
128
129        /**
130         * Radix used in encoding and decoding.
131         */
132        private static final int RADIX = 16;
133
134        static byte[] encodeUrl(FluentBitSet urlsafe, final byte[] bytes) {
135            if (bytes == null) {
136                return null;
137            }
138            if (urlsafe == null) {
139                urlsafe = WWW_FORM_URL_SAFE;
140            }
141
142            final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
143            for (final byte c : bytes) {
144                int b = c;
145                if (b < 0) {
146                    b = EIGHT_BIT_CHARSET_SIZE + b;
147                }
148                if (urlsafe.get(b)) {
149                    if (b == ' ') {
150                        b = '+';
151                    }
152                    buffer.write(b);
153                } else {
154                    buffer.write(ESCAPE_CHAR);
155                    final char hex1 = hexDigit(b >> FOUR_BITS);
156                    final char hex2 = hexDigit(b);
157                    buffer.write(hex1);
158                    buffer.write(hex2);
159                }
160            }
161            return buffer.toByteArray();
162        }
163
164        private static char hexDigit(final int b) {
165            return Character.toUpperCase(Character.forDigit(b & UNSIGNED_BYTE_MASK, RADIX));
166        }
167
168        private URLCodecUtils() {
169        }
170    }
171
172    private static final Log LOG = LogFactory.getLog(URIUtils.class);
173
174    /**
175     * The default charset of the protocol.  RFC 2277, 2396
176     */
177    private static final String DEFAULT_PROTOCOL_CHARSET = StandardCharsets.UTF_8.name();
178
179    private static String encode(final String unescaped, final FluentBitSet allowed, final String charset) {
180        final byte[] rawdata = URLCodecUtils.encodeUrl(allowed, EncodingUtils.getBytes(unescaped, charset));
181        return EncodingUtils.getAsciiString(rawdata, 0, rawdata.length);
182    }
183
184    /**
185     * Escape and encode a string regarded as the path component of an URI with
186     * the default protocol charset.
187     *
188     * @param unescaped an unescaped string
189     * @return the escaped string
190     */
191    public static String encodePath(final String unescaped) {
192        return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
193    }
194
195    /**
196     * Escape and encode a string regarded as the path component of an URI with
197     * a given charset.
198     *
199     * @param unescaped an unescaped string
200     * @param charset the charset
201     * @return the escaped string
202     */
203    public static String encodePath(final String unescaped, final String charset) {
204        if (unescaped == null) {
205            throw new IllegalArgumentException("The string to encode may not be null.");
206        }
207
208        return encode(unescaped, URIBitSets.ALLOWED_ABS_PATH, charset);
209    }
210
211    private URIUtils() {
212    }
213
214}