1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.vfs2.util;
18
19 import java.io.ByteArrayOutputStream;
20 import java.io.UnsupportedEncodingException;
21 import java.nio.charset.Charset;
22 import java.nio.charset.StandardCharsets;
23
24 import org.apache.commons.lang3.StringUtils;
25 import org.apache.commons.lang3.util.FluentBitSet;
26 import org.apache.commons.logging.Log;
27 import org.apache.commons.logging.LogFactory;
28 import org.apache.commons.vfs2.provider.GenericURLFileName;
29
30 /**
31 * The URI escape and character encoding and decoding utility.
32 * <p>
33 * This was forked from some needed methods such as {@code #encodePath(...)} in {@code org.apache.commons.httpclient.util.URIUtil},
34 * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
35 * but it should work with any different HTTP backend provider implementations.
36 * </p>
37 */
38 public class URIUtils {
39
40 /**
41 * Internal character encoding utilities.
42 * <p>
43 * This was forked from some needed methods such as {@code #getBytes(...)} and {@code #getAsciiString(...)}
44 * in {@code org.apache.commons.httpclient.util.EncodingUtil},
45 * in order to not be dependent on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s,
46 * but it should work with any different HTTP backend provider implementations.
47 * </p>
48 */
49 private static final class EncodingUtils {
50
51 /**
52 * Converts the byte array of ASCII characters to a string. This method is
53 * to be used when decoding content of HTTP elements (such as response
54 * headers)
55 *
56 * @param data the byte array to be encoded
57 * @param offset the index of the first byte to encode
58 * @param length the number of bytes to encode
59 * @return The string representation of the byte array
60 */
61 static String getAsciiString(final byte[] data, final int offset, final int length) {
62 return new String(data, offset, length, StandardCharsets.US_ASCII);
63 }
64
65 /**
66 * Converts the specified string to a byte array. If the charset is not supported the
67 * default system charset is used.
68 *
69 * @param data the string to be encoded
70 * @param charsetName the desired character encoding
71 * @return The resulting byte array.
72 */
73 static byte[] getBytes(final String data, final String charsetName) {
74 if (data == null) {
75 throw new IllegalArgumentException("data may not be null");
76 }
77
78 if (StringUtils.isEmpty(charsetName)) {
79 throw new IllegalArgumentException("charset may not be null or empty");
80 }
81
82 try {
83 return data.getBytes(charsetName);
84 } catch (final UnsupportedEncodingException e) {
85
86 if (LOG.isWarnEnabled()) {
87 LOG.warn("Unsupported encoding: " + charsetName + ". System encoding used.");
88 }
89
90 return data.getBytes(Charset.defaultCharset());
91 }
92 }
93
94 private EncodingUtils() {
95 }
96 }
97
98 /**
99 * Internal URL codec utilities.
100 * <p>
101 * This was forked from some needed methods such as {@code #encodeUrl(...)} and {@code #hexDigit(int)}
102 * in {@code org.apache.commons.codec.net.URLCodec}, as commons-codec library cannot be pulled in transitively
103 * via HTTP Client v3 library any more.
104 * </p>
105 */
106 private static final class URLCodecUtils {
107
108 private static final byte ESCAPE_CHAR = '%';
109
110 private static final int EIGHT_BIT_CHARSET_SIZE = 256;
111
112 private static final int FOUR_BITS = 4;
113
114 private static final int UNSIGNED_BYTE_MASK = 0xF;
115
116 // @formatter:off
117 private static final FluentBitSet WWW_FORM_URL_SAFE = URIBitSets.bitSet()
118 // alpha characters
119 .setInclusive('a', 'z')
120 .setInclusive('A', 'Z')
121 // numeric characters
122 .setInclusive('0', '9')
123 // special chars
124 .set('-', '_', '.', '*')
125 // blank to be replaced with +
126 .set(' ');
127 // @formatter:on
128
129 /**
130 * Radix used in encoding and decoding.
131 */
132 private static final int RADIX = 16;
133
134 static byte[] encodeUrl(FluentBitSet urlsafe, final byte[] bytes) {
135 if (bytes == null) {
136 return null;
137 }
138 if (urlsafe == null) {
139 urlsafe = WWW_FORM_URL_SAFE;
140 }
141
142 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
143 for (final byte c : bytes) {
144 int b = c;
145 if (b < 0) {
146 b = EIGHT_BIT_CHARSET_SIZE + b;
147 }
148 if (urlsafe.get(b)) {
149 if (b == ' ') {
150 b = '+';
151 }
152 buffer.write(b);
153 } else {
154 buffer.write(ESCAPE_CHAR);
155 final char hex1 = hexDigit(b >> FOUR_BITS);
156 final char hex2 = hexDigit(b);
157 buffer.write(hex1);
158 buffer.write(hex2);
159 }
160 }
161 return buffer.toByteArray();
162 }
163
164 private static char hexDigit(final int b) {
165 return Character.toUpperCase(Character.forDigit(b & UNSIGNED_BYTE_MASK, RADIX));
166 }
167
168 private URLCodecUtils() {
169 }
170 }
171
172 private static final Log LOG = LogFactory.getLog(URIUtils.class);
173
174 /**
175 * The default charset of the protocol. RFC 2277, 2396
176 */
177 private static final String DEFAULT_PROTOCOL_CHARSET = StandardCharsets.UTF_8.name();
178
179 private static String encode(final String unescaped, final FluentBitSet allowed, final String charset) {
180 final byte[] rawdata = URLCodecUtils.encodeUrl(allowed, EncodingUtils.getBytes(unescaped, charset));
181 return EncodingUtils.getAsciiString(rawdata, 0, rawdata.length);
182 }
183
184 /**
185 * Escape and encode a string regarded as the path component of an URI with
186 * the default protocol charset.
187 *
188 * @param unescaped an unescaped string
189 * @return the escaped string
190 */
191 public static String encodePath(final String unescaped) {
192 return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
193 }
194
195 /**
196 * Escape and encode a string regarded as the path component of an URI with
197 * a given charset.
198 *
199 * @param unescaped an unescaped string
200 * @param charset the charset
201 * @return the escaped string
202 */
203 public static String encodePath(final String unescaped, final String charset) {
204 if (unescaped == null) {
205 throw new IllegalArgumentException("The string to encode may not be null.");
206 }
207
208 return encode(unescaped, URIBitSets.ALLOWED_ABS_PATH, charset);
209 }
210
211 private URIUtils() {
212 }
213
214 }