View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.vfs2.util;
18  
19  import java.io.ByteArrayOutputStream;
20  import java.io.UnsupportedEncodingException;
21  import java.net.URISyntaxException;
22  import java.util.BitSet;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.commons.vfs2.provider.GenericURLFileName;
27  
28  /**
29   * The URI escape and character encoding and decoding utility.
30   * <p>
31   * This was forked from some needed methods such as <code>#encodePath(...)</code> in <code>org.apache.commons.httpclient.util.URIUtil</code>,
32   * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
33   * but it should work with any different HTTP backend provider implementations.
34   * </p>
35   */
36  public class URIUtils {
37  
38      private static final Log LOG = LogFactory.getLog(URIUtils.class);
39  
40      /**
41       * The default charset of the protocol.  RFC 2277, 2396
42       */
43      private static final String DEFAULT_PROTOCOL_CHARSET = "UTF-8";
44  
45      private URIUtils() {
46      }
47  
48      /**
49       * Escape and encode a string regarded as the path component of an URI with
50       * the default protocol charset.
51       *
52       * @param unescaped an unescaped string
53       * @return the escaped string
54       *
55       * @throws URISyntaxException if the default protocol charset is not supported
56       */
57      public static String encodePath(final String unescaped) throws URISyntaxException {
58          return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
59      }
60  
61      /**
62       * Escape and encode a string regarded as the path component of an URI with
63       * a given charset.
64       *
65       * @param unescaped an unescaped string
66       * @param charset the charset
67       * @return the escaped string
68       *
69       * @throws URISyntaxException if the charset is not supported
70       */
71      public static String encodePath(final String unescaped, final String charset) throws URISyntaxException {
72          if (unescaped == null) {
73              throw new IllegalArgumentException("The string to encode may not be null.");
74          }
75  
76          return encode(unescaped, URIBitSets.allowed_abs_path, charset);
77      }
78  
79      private static String encode(final String unescaped, final BitSet allowed, final String charset) throws URISyntaxException {
80          final byte[] rawdata = URLCodecUtils.encodeUrl(allowed, EncodingUtils.getBytes(unescaped, charset));
81          return EncodingUtils.getAsciiString(rawdata, 0, rawdata.length);
82      }
83  
84      /**
85       * Internal URL codec utilities.
86       * <p>
87       * This was forked from some needed methods such as <code>#encodeUrl(...)</code> and <code>#hexDigit(int)</code>
88       * in <code>org.apache.commons.codec.net.URLCodec</code>, as commons-codec library cannot be pulled in transitively
89       * via Http Client v3 library any more.
90       * </p>
91       */
92      private static class URLCodecUtils {
93  
94          private static final byte ESCAPE_CHAR = '%';
95  
96          private static final BitSet WWW_FORM_URL_SAFE = new BitSet(256);
97  
98          // Static initializer for www_form_url
99          static {
100             // alpha characters
101             for (int i = 'a'; i <= 'z'; i++) {
102                 WWW_FORM_URL_SAFE.set(i);
103             }
104             for (int i = 'A'; i <= 'Z'; i++) {
105                 WWW_FORM_URL_SAFE.set(i);
106             }
107             // numeric characters
108             for (int i = '0'; i <= '9'; i++) {
109                 WWW_FORM_URL_SAFE.set(i);
110             }
111             // special chars
112             WWW_FORM_URL_SAFE.set('-');
113             WWW_FORM_URL_SAFE.set('_');
114             WWW_FORM_URL_SAFE.set('.');
115             WWW_FORM_URL_SAFE.set('*');
116             // blank to be replaced with +
117             WWW_FORM_URL_SAFE.set(' ');
118         }
119 
120         /**
121          * Radix used in encoding and decoding.
122          */
123         private static final int RADIX = 16;
124 
125         private URLCodecUtils() {
126         }
127 
128         static final byte[] encodeUrl(BitSet urlsafe, final byte[] bytes) {
129             if (bytes == null) {
130                 return null;
131             }
132             if (urlsafe == null) {
133                 urlsafe = WWW_FORM_URL_SAFE;
134             }
135 
136             final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
137             for (final byte c : bytes) {
138                 int b = c;
139                 if (b < 0) {
140                     b = 256 + b;
141                 }
142                 if (urlsafe.get(b)) {
143                     if (b == ' ') {
144                         b = '+';
145                     }
146                     buffer.write(b);
147                 } else {
148                     buffer.write(ESCAPE_CHAR);
149                     final char hex1 = hexDigit(b >> 4);
150                     final char hex2 = hexDigit(b);
151                     buffer.write(hex1);
152                     buffer.write(hex2);
153                 }
154             }
155             return buffer.toByteArray();
156         }
157 
158         private static char hexDigit(final int b) {
159             return Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
160         }
161     }
162 
163     /**
164      * Internal character encoding utilities.
165      * <p>
166      * This was forked from some needed methods such as <code>#getBytes(...)</code> and <code>#getAsciiString(...)</code>
167      * in <code>org.apache.commons.httpclient.util.EncodingUtil</code>,
168      * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
169      * but it should work with any different HTTP backend provider implementations.
170      * </p>
171      */
172     private static class EncodingUtils {
173 
174         private EncodingUtils() {
175         }
176 
177         /**
178          * Converts the specified string to a byte array.  If the charset is not supported the
179          * default system charset is used.
180          *
181          * @param data the string to be encoded
182          * @param charset the desired character encoding
183          * @return The resulting byte array.
184          */
185         static byte[] getBytes(final String data, final String charset) {
186             if (data == null) {
187                 throw new IllegalArgumentException("data may not be null");
188             }
189 
190             if (charset == null || charset.length() == 0) {
191                 throw new IllegalArgumentException("charset may not be null or empty");
192             }
193 
194             try {
195                 return data.getBytes(charset);
196             } catch (final UnsupportedEncodingException e) {
197 
198                 if (LOG.isWarnEnabled()) {
199                     LOG.warn("Unsupported encoding: " + charset + ". System encoding used.");
200                 }
201 
202                 return data.getBytes();
203             }
204         }
205 
206         /**
207          * Converts the byte array of ASCII characters to a string. This method is
208          * to be used when decoding content of HTTP elements (such as response
209          * headers)
210          *
211          * @param data the byte array to be encoded
212          * @param offset the index of the first byte to encode
213          * @param length the number of bytes to encode
214          * @return The string representation of the byte array
215          */
216         static String getAsciiString(final byte[] data, final int offset, final int length) {
217             try {
218                 return new String(data, offset, length, "US-ASCII");
219             } catch (final UnsupportedEncodingException e) {
220                 throw new RuntimeException("US-ASCII charset is not supported.");
221             }
222         }
223     }
224 
225 }