001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.validator.routines; 018 019import java.io.Serializable; 020import java.net.URI; 021import java.net.URISyntaxException; 022import java.util.Collections; 023import java.util.HashSet; 024import java.util.Locale; 025import java.util.Set; 026import java.util.regex.Matcher; 027import java.util.regex.Pattern; 028 029/** 030 * <p><b>URL Validation</b> routines.</p> 031 * Behavior of validation is modified by passing in options: 032 * <ul> 033 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 034 * component.</li> 035 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 036 * included then fragments are flagged as illegal.</li> 037 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 038 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 039 * </ul> 040 * 041 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 042 * http://javascript.internet.com. However, this validation now bears little resemblance 043 * to the php original.</p> 044 * <pre> 045 * Example of usage: 046 * Construct a UrlValidator with valid schemes of "http", and "https". 047 * 048 * String[] schemes = {"http","https"}. 049 * UrlValidator urlValidator = new UrlValidator(schemes); 050 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 051 * System.out.println("URL is valid"); 052 * } else { 053 * System.out.println("URL is invalid"); 054 * } 055 * 056 * prints "URL is invalid" 057 * If instead the default constructor is used. 058 * 059 * UrlValidator urlValidator = new UrlValidator(); 060 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 061 * System.out.println("URL is valid"); 062 * } else { 063 * System.out.println("URL is invalid"); 064 * } 065 * 066 * prints out "URL is valid" 067 * </pre> 068 * 069 * @see 070 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 071 * Uniform Resource Identifiers (URI): Generic Syntax 072 * </a> 073 * 074 * @since 1.4 075 */ 076public class UrlValidator implements Serializable { 077 078 private static final long serialVersionUID = 7557161713937335013L; 079 080 private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max 081 082 /** 083 * Allows all validly formatted schemes to pass validation instead of 084 * supplying a set of valid schemes. 085 */ 086 public static final long ALLOW_ALL_SCHEMES = 1 << 0; 087 088 /** 089 * Allow two slashes in the path component of the URL. 090 */ 091 public static final long ALLOW_2_SLASHES = 1 << 1; 092 093 /** 094 * Enabling this options disallows any URL fragments. 095 */ 096 public static final long NO_FRAGMENTS = 1 << 2; 097 098 /** 099 * Allow local URLs, such as http://localhost/ or http://machine/ . 100 * This enables a broad-brush check, for complex local machine name 101 * validation requirements you should create your validator with 102 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)}) 103 */ 104 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber 105 106 /** 107 * Protocol scheme (e.g. http, ftp, https). 108 */ 109 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"; 110 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX); 111 112 // Drop numeric, and "+-." for now 113 // TODO does not allow for optional userinfo. 114 // Validation of character set is done by isValidAuthority 115 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6 116 // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123 117 private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix 118 119 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 120 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 121 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 122 // We assume that password has the same valid chars as user info 123 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]"; 124 125 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching 126 private static final String USERINFO_FIELD_REGEX = 127 USERINFO_CHARS_REGEX + "+" + // At least one character for the name 128 "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent 129 130 private static final String AUTHORITY_REGEX = 131 "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?"; 132 // 1 e.g. user:pass@ 2 3 4 133 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 134 135 private static final int PARSE_AUTHORITY_IPV6 = 1; 136 137 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present 138 139 private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon 140 141 /** 142 * Should always be empty. The code currently allows spaces. 143 */ 144 private static final int PARSE_AUTHORITY_EXTRA = 4; 145 146 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"; 147 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX); 148 149 private static final String QUERY_REGEX = "^(\\S*)$"; 150 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX); 151 152 /** 153 * If no schemes are provided, default to this set. 154 */ 155 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case 156 157 /** 158 * Singleton instance of this class with default schemes and options. 159 */ 160 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator(); 161 162 /** 163 * Returns the singleton instance of this class with default schemes and options. 164 * @return singleton instance with default schemes and options 165 */ 166 public static UrlValidator getInstance() { 167 return DEFAULT_URL_VALIDATOR; 168 } 169 170 /** 171 * Tests whether the given flag is on. If the flag is not a power of 2 172 * (e.g. 3) this tests whether the combination of flags is on. 173 * 174 * @param flag Flag value to check. 175 * @param options what to check 176 * 177 * @return whether the specified flag value is on. 178 */ 179 private static boolean isOn(final long flag, final long options) { 180 return (options & flag) > 0; 181 } 182 183 /** 184 * Holds the set of current validation options. 185 */ 186 private final long options; 187 188 /** 189 * The set of schemes that are allowed to be in a URL. 190 */ 191 private final Set<String> allowedSchemes; // Must be lower-case 192 193 /** 194 * Regular expressions used to manually validate authorities if IANA 195 * domain name validation isn't desired. 196 */ 197 private final RegexValidator authorityValidator; 198 199 private final DomainValidator domainValidator; 200 201 /** 202 * Create a UrlValidator with default properties. 203 */ 204 public UrlValidator() { 205 this(null); 206 } 207 208 /** 209 * Initialize a UrlValidator with the given validation options. 210 * @param options The options should be set using the public constants declared in 211 * this class. To set multiple options you simply add them together. For example, 212 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 213 */ 214 public UrlValidator(final long options) { 215 this(null, null, options); 216 } 217 218 /** 219 * Initialize a UrlValidator with the given validation options. 220 * @param authorityValidator Regular expression validator used to validate the authority part 221 * This allows the user to override the standard set of domains. 222 * @param options Validation options. Set using the public constants of this class. 223 * To set multiple options, simply add them together: 224 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 225 * enables both of those options. 226 */ 227 public UrlValidator(final RegexValidator authorityValidator, final long options) { 228 this(null, authorityValidator, options); 229 } 230 231 /** 232 * Behavior of validation is modified by passing in several strings options: 233 * @param schemes Pass in one or more URL schemes to consider valid, passing in 234 * a null will default to "http,https,ftp" being valid. 235 * If a non-null schemes is specified then all valid schemes must 236 * be specified. Setting the ALLOW_ALL_SCHEMES option will 237 * ignore the contents of schemes. 238 */ 239 public UrlValidator(final String[] schemes) { 240 this(schemes, 0L); 241 } 242 243 /** 244 * Behavior of validation is modified by passing in options: 245 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 246 * @param options The options should be set using the public constants declared in 247 * this class. To set multiple options you simply add them together. For example, 248 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 249 */ 250 public UrlValidator(final String[] schemes, final long options) { 251 this(schemes, null, options); 252 } 253 254 /** 255 * Customizable constructor. Validation behavior is modified by passing in options. 256 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 257 * @param authorityValidator Regular expression validator used to validate the authority part 258 * @param options Validation options. Set using the public constants of this class. 259 * To set multiple options, simply add them together: 260 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 261 * enables both of those options. 262 */ 263 public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) { 264 this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options))); 265 } 266 267 /** 268 * Customizable constructor. Validation behavior is modified by passing in options. 269 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 270 * @param authorityValidator Regular expression validator used to validate the authority part 271 * @param options Validation options. Set using the public constants of this class. 272 * To set multiple options, simply add them together: 273 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 274 * enables both of those options. 275 * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting 276 * @since 1.7 277 */ 278 public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) { 279 this.options = options; 280 if (domainValidator == null) { 281 throw new IllegalArgumentException("DomainValidator must not be null"); 282 } 283 if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) { 284 throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting"); 285 } 286 this.domainValidator = domainValidator; 287 288 if (isOn(ALLOW_ALL_SCHEMES)) { 289 allowedSchemes = Collections.emptySet(); 290 } else { 291 if (schemes == null) { 292 schemes = DEFAULT_SCHEMES; 293 } 294 allowedSchemes = new HashSet<>(schemes.length); 295 for (final String scheme : schemes) { 296 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH)); 297 } 298 } 299 300 this.authorityValidator = authorityValidator; 301 } 302 303 /** 304 * Returns the number of times the token appears in the target. 305 * @param token Token value to be counted. 306 * @param target Target value to count tokens in. 307 * @return the number of tokens. 308 */ 309 protected int countToken(final String token, final String target) { 310 int tokenIndex = 0; 311 int count = 0; 312 while (tokenIndex != -1) { 313 tokenIndex = target.indexOf(token, tokenIndex); 314 if (tokenIndex > -1) { 315 tokenIndex++; 316 count++; 317 } 318 } 319 return count; 320 } 321 322 /** 323 * Tests whether the given flag is off. If the flag is not a power of 2 324 * (ie. 3) this tests whether the combination of flags is off. 325 * 326 * @param flag Flag value to check. 327 * 328 * @return whether the specified flag value is off. 329 */ 330 private boolean isOff(final long flag) { 331 return (options & flag) == 0; 332 } 333 334 /** 335 * Tests whether the given flag is on. If the flag is not a power of 2 336 * (ie. 3) this tests whether the combination of flags is on. 337 * 338 * @param flag Flag value to check. 339 * 340 * @return whether the specified flag value is on. 341 */ 342 private boolean isOn(final long flag) { 343 return (options & flag) > 0; 344 } 345 346 /** 347 * <p>Checks if a field has a valid URL address.</p> 348 * 349 * Note that the method calls #isValidAuthority() 350 * which checks that the domain is valid. 351 * 352 * @param value The value validation is being performed on. A <code>null</code> 353 * value is considered invalid. 354 * @return true if the URL is valid. 355 */ 356 public boolean isValid(final String value) { 357 if (value == null) { 358 return false; 359 } 360 361 URI uri; // ensure value is a valid URI 362 try { 363 uri = new URI(value); 364 } catch (final URISyntaxException e) { 365 return false; 366 } 367 // OK, perform additional validation 368 369 final String scheme = uri.getScheme(); 370 if (!isValidScheme(scheme)) { 371 return false; 372 } 373 374 final String authority = uri.getRawAuthority(); 375 if ("file".equals(scheme) && (authority == null || authority.isEmpty())) { // Special case - file: allows an empty authority 376 return true; // this is a local file - nothing more to do here 377 } 378 if ("file".equals(scheme) && authority != null && authority.contains(":")) { 379 return false; 380 } 381 // Validate the authority 382 if (!isValidAuthority(authority)) { 383 return false; 384 } 385 386 if (!isValidPath(uri.getRawPath())) { 387 return false; 388 } 389 390 if (!isValidQuery(uri.getRawQuery())) { 391 return false; 392 } 393 394 if (!isValidFragment(uri.getRawFragment())) { 395 return false; 396 } 397 398 return true; 399 } 400 401 /** 402 * Returns true if the authority is properly formatted. An authority is the combination 403 * of hostname and port. A <code>null</code> authority value is considered invalid. 404 * Note: this implementation validates the domain unless a RegexValidator was provided. 405 * If a RegexValidator was supplied and it matches, then the authority is regarded 406 * as valid with no further checks, otherwise the method checks against the 407 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS) 408 * @param authority Authority value to validate, alllows IDN 409 * @return true if authority (hostname and port) is valid. 410 */ 411 protected boolean isValidAuthority(final String authority) { 412 if (authority == null) { 413 return false; 414 } 415 416 // check manual authority validation if specified 417 if (authorityValidator != null && authorityValidator.isValid(authority)) { 418 return true; 419 } 420 // convert to ASCII if possible 421 final String authorityASCII = DomainValidator.unicodeToASCII(authority); 422 423 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII); 424 if (!authorityMatcher.matches()) { 425 return false; 426 } 427 428 // We have to process IPV6 separately because that is parsed in a different group 429 final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6); 430 if (ipv6 != null) { 431 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 432 if (!inetAddressValidator.isValidInet6Address(ipv6)) { 433 return false; 434 } 435 } else { 436 final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 437 // check if authority is hostname or IP address: 438 // try a hostname first since that's much more likely 439 if (!this.domainValidator.isValid(hostLocation)) { 440 // try an IPv4 address 441 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 442 if (!inetAddressValidator.isValidInet4Address(hostLocation)) { 443 // isn't IPv4, so the URL is invalid 444 return false; 445 } 446 } 447 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); 448 if (port != null && !port.isEmpty()) { 449 try { 450 final int iPort = Integer.parseInt(port); 451 if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) { 452 return false; 453 } 454 } catch (final NumberFormatException nfe) { 455 return false; // this can happen for big numbers 456 } 457 } 458 } 459 460 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); 461 if (extra != null && !extra.trim().isEmpty()) { 462 return false; 463 } 464 465 return true; 466 } 467 468 /** 469 * Returns true if the given fragment is null or fragments are allowed. 470 * @param fragment Fragment value to validate. 471 * @return true if fragment is valid. 472 */ 473 protected boolean isValidFragment(final String fragment) { 474 if (fragment == null) { 475 return true; 476 } 477 478 return isOff(NO_FRAGMENTS); 479 } 480 481 /** 482 * Returns true if the path is valid. A <code>null</code> value is considered invalid. 483 * @param path Path value to validate. 484 * @return true if path is valid. 485 */ 486 protected boolean isValidPath(final String path) { 487 if (path == null) { 488 return false; 489 } 490 491 if (!PATH_PATTERN.matcher(path).matches()) { 492 return false; 493 } 494 495 try { 496 // Don't omit host otherwise leading path may be taken as host if it starts with // 497 final URI uri = new URI(null,"localhost",path,null); 498 final String norm = uri.normalize().getPath(); 499 if (norm.startsWith("/../") // Trying to go via the parent dir 500 || norm.equals("/..")) { // Trying to go to the parent dir 501 return false; 502 } 503 } catch (final URISyntaxException e) { 504 return false; 505 } 506 507 final int slash2Count = countToken("//", path); 508 if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) { 509 return false; 510 } 511 512 return true; 513 } 514 515 /** 516 * Returns true if the query is null or it's a properly formatted query string. 517 * @param query Query value to validate. 518 * @return true if query is valid. 519 */ 520 protected boolean isValidQuery(final String query) { 521 if (query == null) { 522 return true; 523 } 524 525 return QUERY_PATTERN.matcher(query).matches(); 526 } 527 528 /** 529 * Validate scheme. If schemes[] was initialized to a non null, 530 * then only those schemes are allowed. 531 * Otherwise the default schemes are "http", "https", "ftp". 532 * Matching is case-blind. 533 * @param scheme The scheme to validate. A <code>null</code> value is considered 534 * invalid. 535 * @return true if valid. 536 */ 537 protected boolean isValidScheme(final String scheme) { 538 if (scheme == null) { 539 return false; 540 } 541 542 if (!SCHEME_PATTERN.matcher(scheme).matches()) { 543 return false; 544 } 545 546 if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) { 547 return false; 548 } 549 550 return true; 551 } 552 553}