View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.vfs2.util;
18  
19  import java.io.ByteArrayOutputStream;
20  import java.io.UnsupportedEncodingException;
21  import java.nio.charset.Charset;
22  import java.nio.charset.StandardCharsets;
23  
24  import org.apache.commons.lang3.StringUtils;
25  import org.apache.commons.lang3.util.FluentBitSet;
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.commons.vfs2.provider.GenericURLFileName;
29  
30  /**
31   * The URI escape and character encoding and decoding utility.
32   * <p>
33   * This was forked from some needed methods such as {@code #encodePath(...)} in {@code org.apache.commons.httpclient.util.URIUtil},
34   * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
35   * but it should work with any different HTTP backend provider implementations.
36   * </p>
37   */
38  public class URIUtils {
39  
40      /**
41       * Internal character encoding utilities.
42       * <p>
43       * This was forked from some needed methods such as {@code #getBytes(...)} and {@code #getAsciiString(...)}
44       * in {@code org.apache.commons.httpclient.util.EncodingUtil},
45       * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
46       * but it should work with any different HTTP backend provider implementations.
47       * </p>
48       */
49      private static final class EncodingUtils {
50  
51          /**
52           * Converts the byte array of ASCII characters to a string. This method is
53           * to be used when decoding content of HTTP elements (such as response
54           * headers)
55           *
56           * @param data the byte array to be encoded
57           * @param offset the index of the first byte to encode
58           * @param length the number of bytes to encode
59           * @return The string representation of the byte array
60           */
61          static String getAsciiString(final byte[] data, final int offset, final int length) {
62              return new String(data, offset, length, StandardCharsets.US_ASCII);
63          }
64  
65          /**
66           * Converts the specified string to a byte array.  If the charset is not supported the
67           * default system charset is used.
68           *
69           * @param data the string to be encoded
70           * @param charsetName the desired character encoding
71           * @return The resulting byte array.
72           */
73          static byte[] getBytes(final String data, final String charsetName) {
74              if (data == null) {
75                  throw new IllegalArgumentException("data may not be null");
76              }
77  
78              if (StringUtils.isEmpty(charsetName)) {
79                  throw new IllegalArgumentException("charset may not be null or empty");
80              }
81  
82              try {
83                  return data.getBytes(charsetName);
84              } catch (final UnsupportedEncodingException e) {
85  
86                  if (LOG.isWarnEnabled()) {
87                      LOG.warn("Unsupported encoding: " + charsetName + ". System encoding used.");
88                  }
89  
90                  return data.getBytes(Charset.defaultCharset());
91              }
92          }
93  
94          private EncodingUtils() {
95          }
96      }
97  
98      /**
99       * Internal URL codec utilities.
100      * <p>
101      * This was forked from some needed methods such as {@code #encodeUrl(...)} and {@code #hexDigit(int)}
102      * in {@code org.apache.commons.codec.net.URLCodec}, as commons-codec library cannot be pulled in transitively
103      * via HTTP Client v3 library any more.
104      * </p>
105      */
106     private static final class URLCodecUtils {
107 
108         private static final byte ESCAPE_CHAR = '%';
109 
110         private static final int EIGHT_BIT_CHARSET_SIZE = 256;
111 
112         private static final int FOUR_BITS = 4;
113 
114         private static final int UNSIGNED_BYTE_MASK = 0xF;
115 
116         // @formatter:off
117         private static final FluentBitSet WWW_FORM_URL_SAFE = URIBitSets.bitSet()
118             // alpha characters
119             .setInclusive('a', 'z')
120             .setInclusive('A', 'Z')
121             // numeric characters
122             .setInclusive('0', '9')
123             // special chars
124             .set('-', '_', '.', '*')
125             // blank to be replaced with +
126             .set(' ');
127         // @formatter:on
128 
129         /**
130          * Radix used in encoding and decoding.
131          */
132         private static final int RADIX = 16;
133 
134         static byte[] encodeUrl(FluentBitSet urlsafe, final byte[] bytes) {
135             if (bytes == null) {
136                 return null;
137             }
138             if (urlsafe == null) {
139                 urlsafe = WWW_FORM_URL_SAFE;
140             }
141 
142             final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
143             for (final byte c : bytes) {
144                 int b = c;
145                 if (b < 0) {
146                     b = EIGHT_BIT_CHARSET_SIZE + b;
147                 }
148                 if (urlsafe.get(b)) {
149                     if (b == ' ') {
150                         b = '+';
151                     }
152                     buffer.write(b);
153                 } else {
154                     buffer.write(ESCAPE_CHAR);
155                     final char hex1 = hexDigit(b >> FOUR_BITS);
156                     final char hex2 = hexDigit(b);
157                     buffer.write(hex1);
158                     buffer.write(hex2);
159                 }
160             }
161             return buffer.toByteArray();
162         }
163 
164         private static char hexDigit(final int b) {
165             return Character.toUpperCase(Character.forDigit(b & UNSIGNED_BYTE_MASK, RADIX));
166         }
167 
168         private URLCodecUtils() {
169         }
170     }
171 
172     private static final Log LOG = LogFactory.getLog(URIUtils.class);
173 
174     /**
175      * The default charset of the protocol.  RFC 2277, 2396
176      */
177     private static final String DEFAULT_PROTOCOL_CHARSET = StandardCharsets.UTF_8.name();
178 
179     private static String encode(final String unescaped, final FluentBitSet allowed, final String charset) {
180         final byte[] rawdata = URLCodecUtils.encodeUrl(allowed, EncodingUtils.getBytes(unescaped, charset));
181         return EncodingUtils.getAsciiString(rawdata, 0, rawdata.length);
182     }
183 
184     /**
185      * Escape and encode a string regarded as the path component of an URI with
186      * the default protocol charset.
187      *
188      * @param unescaped an unescaped string
189      * @return the escaped string
190      */
191     public static String encodePath(final String unescaped) {
192         return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
193     }
194 
195     /**
196      * Escape and encode a string regarded as the path component of an URI with
197      * a given charset.
198      *
199      * @param unescaped an unescaped string
200      * @param charset the charset
201      * @return the escaped string
202      */
203     public static String encodePath(final String unescaped, final String charset) {
204         if (unescaped == null) {
205             throw new IllegalArgumentException("The string to encode may not be null.");
206         }
207 
208         return encode(unescaped, URIBitSets.ALLOWED_ABS_PATH, charset);
209     }
210 
211     private URIUtils() {
212     }
213 
214 }