/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.routines.InetAddressValidator;
import org.apache.commons.validator.util.Flags;

/**
 * <p>Validates URLs.</p>
 * Behaviour of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE] By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
 * </ul>
 *
 * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * https://javascript.internet.com. However, this validation now bears little resemblance
 * to the PHP original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http" and "https".
 *
 *    String[] schemes = {"http","https"};
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *    prints "URL is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *   prints out "URL is valid"
 *  </pre>
 *
 * @see
 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @since 1.1
 * @deprecated Use the new UrlValidator in the routines package. This class
 * will be removed in a future release.
 */
@Deprecated
public class UrlValidator implements Serializable {

    /** Longest top-level domain label (in characters) accepted for a host name. */
    private static final int TOP_LEVEL_MAX_LEN = 4;

    /** Shortest top-level domain label (in characters) accepted for a host name. */
    private static final int TOP_LEVEL_MIN_LEN = 2;

    private static final long serialVersionUID = 24137157400029593L;

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final int ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final int ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final int NO_FRAGMENTS = 1 << 2;

    /** Character-class body matching a single ASCII letter. */
    private static final String ALPHA_CHARS = "a-zA-Z";

    /** Characters that may not appear inside a domain label. */
    private static final String SPECIAL_CHARS = ";/@&=,.?:+$";

    /** Character class for one domain-label character: anything except whitespace and {@link #SPECIAL_CHARS}. */
    private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";

    /** Character-class body for the host part of an authority: alphanumerics, '-' and '.'. */
    private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";

    /** One domain label: one or more {@link #VALID_CHARS}. */
    private static final String ATOM = VALID_CHARS + '+';

    /**
     * This expression derived/taken from the BNF for URI (RFC2396).
     * The digits under the expression mark where each capture group opens.
     */
    private static final String URL_REGEX =
            "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
    //        12            3  4          5       6   7        8 9
    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);

    /**
     * Schema/Protocol (ie. http:, ftp:, file:, etc).
     */
    private static final int PARSE_URL_SCHEME = 2;

    /**
     * Includes hostname/ip and port number.
     */
    private static final int PARSE_URL_AUTHORITY = 4;

    /** Capture group of {@link #URL_PATTERN} holding the path. */
    private static final int PARSE_URL_PATH = 5;

    /** Capture group of {@link #URL_PATTERN} holding the query (without the leading '?'). */
    private static final int PARSE_URL_QUERY = 7;

    /** Capture group of {@link #URL_PATTERN} holding the fragment (without the leading '#'). */
    private static final int PARSE_URL_FRAGMENT = 9;

    /**
     * Protocol (for example, http:, ftp:, https:).
     */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*");

    /**
     * Splits an authority into three capture groups: (1) host name or IP,
     * (2) optional ":port", (3) whatever is left over.
     */
    private static final String AUTHORITY_REGEX =
            "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
    private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

    /** Capture group of {@link #AUTHORITY_PATTERN} holding the host name or IP address. */
    private static final int PARSE_AUTHORITY_HOST_IP = 1;

    /** Capture group of {@link #AUTHORITY_PATTERN} holding the port, including its leading ':'. */
    private static final int PARSE_AUTHORITY_PORT = 2;

    /**
     * Should always be empty.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 3;

    private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$");

    private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");

    private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$");

    /** One or more domain labels separated by single dots. */
    private static final Pattern DOMAIN_PATTERN =
            Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");

    /**
     * A ':' followed by one to five digits. The numeric value is not
     * range-checked, so values above 65535 are accepted.
     */
    private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");

    private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]");

    /**
     * Holds the set of current validation options.
     */
    private final Flags options;

    /**
     * The set of schemes that are allowed to be in a URL.
     */
    private final Set<String> allowedSchemes = new HashSet<>();

    /**
     * If no schemes are provided, default to this set.
     */
    protected String[] defaultSchemes = {"http", "https", "ftp"};

    /**
     * Create a UrlValidator with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final int options) {
        this(null, options);
    }

    /**
     * Creates a validator restricted to the given schemes, with no options set.
     * @param schemes Pass in one or more URL schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified.
     */
    public UrlValidator(final String[] schemes) {
        this(schemes, 0);
    }

    /**
     * Creates a validator with the given schemes and options.
     * @param schemes The set of valid schemes; {@code null} selects
     * {@link #defaultSchemes}. Ignored when ALLOW_ALL_SCHEMES is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final String[] schemes, final int options) {
        this.options = new Flags(options);

        // With ALLOW_ALL_SCHEMES the allowed set is never consulted, so leave it empty.
        if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
            return;
        }

        final String[] effectiveSchemes = schemes == null ? defaultSchemes : schemes;
        allowedSchemes.addAll(Arrays.asList(effectiveSchemes));
    }

    /**
     * Returns the number of times the token appears in the target.
     * Overlapping occurrences are each counted, so {@code "//"} occurs twice
     * in {@code "///"}.
     * @param token Token value to be counted.
     * @param target Target value to count tokens in.
     * @return the number of tokens; 0 if either argument is {@code null} or
     *         the token is empty.
     */
    protected int countToken(final String token, final String target) {
        // indexOf("") never returns -1, so an empty token would loop forever.
        if (token == null || target == null || token.isEmpty()) {
            return 0;
        }

        int count = 0;
        int tokenIndex = target.indexOf(token);
        while (tokenIndex != -1) {
            count++;
            // Resume one character further on so overlapping matches are found.
            tokenIndex = target.indexOf(token, tokenIndex + 1);
        }
        return count;
    }

    /**
     * <p>Checks if a field has a valid URL address.</p>
     *
     * <p>The scheme, authority, path, query and fragment are checked in that
     * order and validation stops at the first component that fails.</p>
     *
     * @param value The value validation is being performed on.  A {@code null}
     * value is considered invalid, as is an empty value or one containing
     * non-ASCII characters.
     * @return true if the URL is valid.
     */
    public boolean isValid(final String value) {
        if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
            return false;
        }

        // Check the whole url address structure
        final Matcher urlMatcher = URL_PATTERN.matcher(value);
        if (!urlMatcher.matches()) {
            return false;
        }

        return isValidScheme(urlMatcher.group(PARSE_URL_SCHEME))
                && isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY))
                && isValidPath(urlMatcher.group(PARSE_URL_PATH))
                && isValidQuery(urlMatcher.group(PARSE_URL_QUERY))
                && isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT));
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A {@code null} authority value is considered invalid.
     *
     * <p>The host must either be accepted by {@link InetAddressValidator} or be a
     * dotted host name with an acceptable top-level label. A port, when present,
     * must be a ':' followed by one to five digits (the value is not range-checked).
     * Nothing may follow the host and port.</p>
     *
     * @param authority Authority value to validate.
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(final String authority) {
        if (authority == null) {
            return false;
        }

        final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
        if (!authorityMatcher.matches()) {
            return false;
        }

        // The host is acceptable as an IP address or, failing that, as a host name.
        final String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
        if (!InetAddressValidator.getInstance().isValid(hostIP) && !isValidHostname(hostIP)) {
            return false;
        }

        final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
        if (port != null && !PORT_PATTERN.matcher(port).matches()) {
            return false;
        }

        // Anything left after host and port (user info, stray characters) is illegal.
        return GenericValidator.isBlankOrNull(authorityMatcher.group(PARSE_AUTHORITY_EXTRA));
    }

    /**
     * Checks that the host is a dotted name of at least two labels whose last
     * label is 2 to 4 characters long and starts with a letter.
     */
    private static boolean isValidHostname(final String host) {
        if (!DOMAIN_PATTERN.matcher(host).matches()) {
            return false;
        }

        // DOMAIN_PATTERN guarantees non-empty labels separated by single dots, so
        // the top-level label is simply the text after the last dot.
        final int lastDot = host.lastIndexOf('.');
        if (lastDot < 0) {
            // Make sure there's a host name preceding the top-level label.
            return false;
        }

        final String topLevel = host.substring(lastDot + 1);
        // First letter of top level must be an alpha
        return topLevel.length() >= TOP_LEVEL_MIN_LEN
                && topLevel.length() <= TOP_LEVEL_MAX_LEN
                && ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches();
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(final String fragment) {
        return fragment == null || options.isOff(NO_FRAGMENTS);
    }

    /**
     * Returns true if the path is valid.  A {@code null} value is considered invalid.
     *
     * <p>Besides matching the legal path characters, the path may not contain
     * "//" unless ALLOW_2_SLASHES is set, and may not contain as many ".."
     * sequences as it has single-slash separators after the first.</p>
     *
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(final String path) {
        if (path == null || !PATH_PATTERN.matcher(path).matches()) {
            return false;
        }

        final int slash2Count = countToken("//", path);
        if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
            return false;
        }

        final int slashCount = countToken("/", path);
        final int dot2Count = countToken("..", path);
        // Reject paths whose ".." count reaches the number of remaining separators.
        return !(dot2Count > 0 && slashCount - slash2Count - 1 <= dot2Count);
    }

    /**
     * Returns true if the query is null, or it's a properly formatted query string.
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(final String query) {
        return query == null || QUERY_PATTERN.matcher(query).matches();
    }

    /**
     * Validate scheme. The scheme must be syntactically well formed and, unless
     * ALLOW_ALL_SCHEMES is set, must be one of the schemes this validator was
     * constructed with. The comparison against the allowed schemes is
     * case-sensitive.
     * @param scheme The scheme to validate.  A {@code null} value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(final String scheme) {
        if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches()) {
            return false;
        }

        final boolean restrictedToAllowedSet = options.isOff(ALLOW_ALL_SCHEMES);
        return !restrictedToAllowedSet || allowedSchemes.contains(scheme);
    }
}