/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.commons.vfs2.util;
18  
19  import java.io.ByteArrayOutputStream;
20  import java.io.UnsupportedEncodingException;
21  import java.nio.charset.Charset;
22  import java.nio.charset.StandardCharsets;
23  import java.util.BitSet;
24  
25  import org.apache.commons.lang3.StringUtils;
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.commons.vfs2.provider.GenericURLFileName;
29  
30  /**
31   * The URI escape and character encoding and decoding utility.
32   * <p>
33   * This was forked from some needed methods such as {@code #encodePath(...)} in {@code org.apache.commons.httpclient.util.URIUtil},
34   * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
35   * but it should work with any different HTTP backend provider implementations.
36   * </p>
37   */
38  public class URIUtils {
39  
40      /**
41       * Internal character encoding utilities.
42       * <p>
43       * This was forked from some needed methods such as {@code #getBytes(...)} and {@code #getAsciiString(...)}
44       * in {@code org.apache.commons.httpclient.util.EncodingUtil},
45       * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
46       * but it should work with any different HTTP backend provider implementations.
47       * </p>
48       */
49      private static class EncodingUtils {
50  
51          /**
52           * Converts the byte array of ASCII characters to a string. This method is
53           * to be used when decoding content of HTTP elements (such as response
54           * headers)
55           *
56           * @param data the byte array to be encoded
57           * @param offset the index of the first byte to encode
58           * @param length the number of bytes to encode
59           * @return The string representation of the byte array
60           */
61          static String getAsciiString(final byte[] data, final int offset, final int length) {
62              return new String(data, offset, length, StandardCharsets.US_ASCII);
63          }
64  
65          /**
66           * Converts the specified string to a byte array.  If the charset is not supported the
67           * default system charset is used.
68           *
69           * @param data the string to be encoded
70           * @param charsetName the desired character encoding
71           * @return The resulting byte array.
72           */
73          static byte[] getBytes(final String data, final String charsetName) {
74              if (data == null) {
75                  throw new IllegalArgumentException("data may not be null");
76              }
77  
78              if (StringUtils.isEmpty(charsetName)) {
79                  throw new IllegalArgumentException("charset may not be null or empty");
80              }
81  
82              try {
83                  return data.getBytes(charsetName);
84              } catch (final UnsupportedEncodingException e) {
85  
86                  if (LOG.isWarnEnabled()) {
87                      LOG.warn("Unsupported encoding: " + charsetName + ". System encoding used.");
88                  }
89  
90                  return data.getBytes(Charset.defaultCharset());
91              }
92          }
93  
94          private EncodingUtils() {
95          }
96      }
97  
98      /**
99       * Internal URL codec utilities.
100      * <p>
101      * This was forked from some needed methods such as {@code #encodeUrl(...)} and {@code #hexDigit(int)}
102      * in {@code org.apache.commons.codec.net.URLCodec}, as commons-codec library cannot be pulled in transitively
103      * via Http Client v3 library any more.
104      * </p>
105      */
106     private static class URLCodecUtils {
107 
108         private static final byte ESCAPE_CHAR = '%';
109 
110         private static final BitSet WWW_FORM_URL_SAFE = new BitSet(256);
111 
112         // Static initializer for www_form_url
113         static {
114             // alpha characters
115             for (int i = 'a'; i <= 'z'; i++) {
116                 WWW_FORM_URL_SAFE.set(i);
117             }
118             for (int i = 'A'; i <= 'Z'; i++) {
119                 WWW_FORM_URL_SAFE.set(i);
120             }
121             // numeric characters
122             for (int i = '0'; i <= '9'; i++) {
123                 WWW_FORM_URL_SAFE.set(i);
124             }
125             // special chars
126             WWW_FORM_URL_SAFE.set('-');
127             WWW_FORM_URL_SAFE.set('_');
128             WWW_FORM_URL_SAFE.set('.');
129             WWW_FORM_URL_SAFE.set('*');
130             // blank to be replaced with +
131             WWW_FORM_URL_SAFE.set(' ');
132         }
133 
134         /**
135          * Radix used in encoding and decoding.
136          */
137         private static final int RADIX = 16;
138 
139         static final byte[] encodeUrl(BitSet urlsafe, final byte[] bytes) {
140             if (bytes == null) {
141                 return null;
142             }
143             if (urlsafe == null) {
144                 urlsafe = WWW_FORM_URL_SAFE;
145             }
146 
147             final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
148             for (final byte c : bytes) {
149                 int b = c;
150                 if (b < 0) {
151                     b = 256 + b;
152                 }
153                 if (urlsafe.get(b)) {
154                     if (b == ' ') {
155                         b = '+';
156                     }
157                     buffer.write(b);
158                 } else {
159                     buffer.write(ESCAPE_CHAR);
160                     final char hex1 = hexDigit(b >> 4);
161                     final char hex2 = hexDigit(b);
162                     buffer.write(hex1);
163                     buffer.write(hex2);
164                 }
165             }
166             return buffer.toByteArray();
167         }
168 
169         private static char hexDigit(final int b) {
170             return Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
171         }
172 
173         private URLCodecUtils() {
174         }
175     }
176 
177     private static final Log LOG = LogFactory.getLog(URIUtils.class);
178 
179     /**
180      * The default charset of the protocol.  RFC 2277, 2396
181      */
182     private static final String DEFAULT_PROTOCOL_CHARSET = "UTF-8";
183 
184     private static String encode(final String unescaped, final BitSet allowed, final String charset) {
185         final byte[] rawdata = URLCodecUtils.encodeUrl(allowed, EncodingUtils.getBytes(unescaped, charset));
186         return EncodingUtils.getAsciiString(rawdata, 0, rawdata.length);
187     }
188 
189     /**
190      * Escape and encode a string regarded as the path component of an URI with
191      * the default protocol charset.
192      *
193      * @param unescaped an unescaped string
194      * @return the escaped string
195      */
196     public static String encodePath(final String unescaped) {
197         return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
198     }
199 
200     /**
201      * Escape and encode a string regarded as the path component of an URI with
202      * a given charset.
203      *
204      * @param unescaped an unescaped string
205      * @param charset the charset
206      * @return the escaped string
207      */
208     public static String encodePath(final String unescaped, final String charset) {
209         if (unescaped == null) {
210             throw new IllegalArgumentException("The string to encode may not be null.");
211         }
212 
213         return encode(unescaped, URIBitSets.allowed_abs_path, charset);
214     }
215 
216     private URIUtils() {
217     }
218 
219 }