/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.vfs2.util;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.util.FluentBitSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.vfs2.provider.GenericURLFileName;

/**
 * Utility for URI escaping and the character encoding/decoding that goes with it.
 * <p>
 * The needed methods, such as {@code #encodePath(...)}, were forked from
 * {@code org.apache.commons.httpclient.util.URIUtil} so that generating and handling
 * {@link GenericURLFileName}s does not depend on the HttpClient v3 API; it should work with
 * any HTTP backend provider implementation.
 * </p>
 */
public class URIUtils {

    /**
     * Internal helpers for converting between strings and byte arrays.
     * <p>
     * The needed methods, such as {@code #getBytes(...)} and {@code #getAsciiString(...)}, were forked from
     * {@code org.apache.commons.httpclient.util.EncodingUtil} so that generating and handling
     * {@link GenericURLFileName}s does not depend on the HttpClient v3 API; it should work with
     * any HTTP backend provider implementation.
     * </p>
     */
    private static final class EncodingUtils {

        /**
         * Decodes a slice of a byte array as US-ASCII text.
         *
         * @param data the bytes to decode
         * @param offset the index of the first byte to decode
         * @param length the number of bytes to decode
         * @return the decoded string
         */
        static String getAsciiString(final byte[] data, final int offset, final int length) {
            final Charset ascii = StandardCharsets.US_ASCII;
            return new String(data, offset, length, ascii);
        }

        /**
         * Encodes a string into bytes using the named charset. When the named charset is not supported,
         * a warning is logged (if warn logging is enabled) and the default charset is used instead.
         *
         * @param data the string to encode
         * @param charsetName the name of the desired charset
         * @return the encoded bytes
         * @throws IllegalArgumentException if {@code data} is {@code null} or {@code charsetName} is
         *         {@code null} or empty
         */
        static byte[] getBytes(final String data, final String charsetName) {
            if (data == null) {
                throw new IllegalArgumentException("data may not be null");
            }
            if (StringUtils.isEmpty(charsetName)) {
                throw new IllegalArgumentException("charset may not be null or empty");
            }

            byte[] encoded;
            try {
                encoded = data.getBytes(charsetName);
            } catch (final UnsupportedEncodingException e) {
                // Best effort: report the problem and carry on with the default charset.
                if (LOG.isWarnEnabled()) {
                    final String message = "Unsupported encoding: " + charsetName + ". System encoding used.";
                    LOG.warn(message);
                }
                encoded = data.getBytes(Charset.defaultCharset());
            }
            return encoded;
        }

        private EncodingUtils() {
        }
    }

    /**
     * Internal percent-encoding helpers.
     * <p>
     * The needed methods, such as {@code #encodeUrl(...)} and {@code #hexDigit(int)}, were forked from
     * {@code org.apache.commons.codec.net.URLCodec}, as the commons-codec library cannot be pulled in
     * transitively via the HTTP Client v3 library any more.
     * </p>
     */
    private static final class URLCodecUtils {

        /** Byte that introduces an escaped octet. */
        private static final byte ESCAPE_CHAR = '%';

        /** Added to a negative {@code byte} value to obtain its 0-255 equivalent. */
        private static final int EIGHT_BIT_CHARSET_SIZE = 256;

        /** Shift that moves the high four bits of an octet into the low four bits. */
        private static final int FOUR_BITS = 4;

        /** Mask that keeps only the low four bits of a value. */
        private static final int UNSIGNED_BYTE_MASK = 0xF;

        // @formatter:off
        private static final FluentBitSet WWW_FORM_URL_SAFE = URIBitSets.bitSet()
            // alpha characters
            .setInclusive('a', 'z')
            .setInclusive('A', 'Z')
            // numeric characters
            .setInclusive('0', '9')
            // special chars
            .set('-', '_', '.', '*')
            // blank to be replaced with +
            .set(' ');
        // @formatter:on

        /**
         * Radix used in encoding and decoding.
         */
        private static final int RADIX = 16;

        /**
         * Percent-encodes the given bytes. Octets whose bit is set in {@code urlsafe} are copied through,
         * except that a space is written as {@code '+'}; every other octet is written as {@code '%'}
         * followed by two upper-case hexadecimal digits.
         *
         * @param urlsafe the set of octets that need no escaping; when {@code null}, the built-in
         *        {@code WWW_FORM_URL_SAFE} set is used
         * @param bytes the bytes to encode
         * @return the encoded bytes, or {@code null} if {@code bytes} is {@code null}
         */
        static byte[] encodeUrl(final FluentBitSet urlsafe, final byte[] bytes) {
            if (bytes == null) {
                return null;
            }
            final FluentBitSet safeOctets = urlsafe != null ? urlsafe : WWW_FORM_URL_SAFE;

            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            for (final byte raw : bytes) {
                // Java bytes are signed; map negative values onto 128-255.
                final int octet = raw < 0 ? EIGHT_BIT_CHARSET_SIZE + raw : raw;
                if (!safeOctets.get(octet)) {
                    out.write(ESCAPE_CHAR);
                    out.write(hexDigit(octet >> FOUR_BITS));
                    out.write(hexDigit(octet));
                    continue;
                }
                out.write(octet == ' ' ? '+' : octet);
            }
            return out.toByteArray();
        }

        /**
         * Returns the upper-case hexadecimal digit for the low four bits of the given value.
         *
         * @param b the value whose low four bits are converted
         * @return the hexadecimal digit character
         */
        private static char hexDigit(final int b) {
            final char digit = Character.forDigit(b & UNSIGNED_BYTE_MASK, RADIX);
            return Character.toUpperCase(digit);
        }

        private URLCodecUtils() {
        }
    }

    private static final Log LOG = LogFactory.getLog(URIUtils.class);

    /**
     * The default charset of the protocol. RFC 2277, 2396
     */
    private static final String DEFAULT_PROTOCOL_CHARSET = StandardCharsets.UTF_8.name();

    /**
     * Converts the string to bytes in the given charset, percent-encodes every octet not in
     * {@code allowed}, and returns the result as an ASCII string.
     *
     * @param unescaped the string to escape
     * @param allowed the set of octets that need no escaping
     * @param charset the name of the charset used to convert the string to bytes
     * @return the escaped string
     */
    private static String encode(final String unescaped, final FluentBitSet allowed, final String charset) {
        final byte[] plainBytes = EncodingUtils.getBytes(unescaped, charset);
        final byte[] escapedBytes = URLCodecUtils.encodeUrl(allowed, plainBytes);
        return EncodingUtils.getAsciiString(escapedBytes, 0, escapedBytes.length);
    }

    /**
     * Escapes and encodes a string treated as the path component of a URI, using
     * the default protocol charset.
     *
     * @param unescaped an unescaped string
     * @return the escaped string
     */
    public static String encodePath(final String unescaped) {
        return encodePath(unescaped, DEFAULT_PROTOCOL_CHARSET);
    }

    /**
     * Escapes and encodes a string treated as the path component of a URI, using
     * the given charset.
     *
     * @param unescaped an unescaped string
     * @param charset the charset
     * @return the escaped string
     * @throws IllegalArgumentException if {@code unescaped} is {@code null}
     */
    public static String encodePath(final String unescaped, final String charset) {
        if (unescaped != null) {
            return encode(unescaped, URIBitSets.ALLOWED_ABS_PATH, charset);
        }
        throw new IllegalArgumentException("The string to encode may not be null.");
    }

    private URIUtils() {
    }

}