1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.validator.routines; 18 19 import java.io.Serializable; 20 import java.net.URI; 21 import java.net.URISyntaxException; 22 import java.util.Collections; 23 import java.util.HashSet; 24 import java.util.Locale; 25 import java.util.Set; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 29 import org.apache.commons.validator.GenericValidator; 30 31 /** 32 * <p><strong>URL Validation</strong> routines.</p> 33 * Behavior of validation is modified by passing in options: 34 * <ul> 35 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 36 * component.</li> 37 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 38 * included then fragments are flagged as illegal.</li> 39 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 40 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 41 * </ul> 42 * 43 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 44 * https://javascript.internet.com. However, this validation now bears little resemblance 45 * to the php original.</p> 46 * <pre> 47 * Example of usage: 48 * Construct a UrlValidator with valid schemes of "http", and "https". 49 * 50 * String[] schemes = {"http","https"}. 51 * UrlValidator urlValidator = new UrlValidator(schemes); 52 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 53 * System.out.println("URL is valid"); 54 * } else { 55 * System.out.println("URL is invalid"); 56 * } 57 * 58 * prints "URL is invalid" 59 * If instead the default constructor is used. 60 * 61 * UrlValidator urlValidator = new UrlValidator(); 62 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 63 * System.out.println("URL is valid"); 64 * } else { 65 * System.out.println("URL is invalid"); 66 * } 67 * 68 * prints out "URL is valid" 69 * </pre> 70 * 71 * @see 72 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 73 * Uniform Resource Identifiers (URI): Generic Syntax 74 * </a> 75 * 76 * @since 1.4 77 */ 78 public class UrlValidator implements Serializable { 79 80 private static final long serialVersionUID = 7557161713937335013L; 81 82 private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max 83 84 /** 85 * Allows all validly formatted schemes to pass validation instead of 86 * supplying a set of valid schemes. 87 */ 88 public static final long ALLOW_ALL_SCHEMES = 1 << 0; 89 90 /** 91 * Allow two slashes in the path component of the URL. 92 */ 93 public static final long ALLOW_2_SLASHES = 1 << 1; 94 95 /** 96 * Enabling this options disallows any URL fragments. 97 */ 98 public static final long NO_FRAGMENTS = 1 << 2; 99 100 /** 101 * Allow local URLs, such as https://localhost/ or https://machine/ . 102 * This enables a broad-brush check, for complex local machine name 103 * validation requirements you should create your validator with 104 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)}) 105 */ 106 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber 107 108 /** 109 * Protocol scheme (for example, http, ftp, https). 110 */ 111 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"; 112 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX); 113 114 // Drop numeric, and "+-." for now 115 // TODO does not allow for optional userinfo. 116 // Validation of character set is done by isValidAuthority 117 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6 118 // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123 119 private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix 120 121 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 122 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 123 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 124 // We assume that password has the same valid chars as user info 125 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]"; 126 127 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching 128 private static final String USERINFO_FIELD_REGEX = 129 USERINFO_CHARS_REGEX + "+" + // At least one character for the name 130 "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent 131 132 private static final String AUTHORITY_REGEX = 133 "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?"; 134 // 1 for example, user:pass@ 2 3 4 135 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 136 137 private static final int PARSE_AUTHORITY_IPV6 = 1; 138 139 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present 140 141 private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon 142 143 /** 144 * Should always be empty. The code currently allows spaces. 145 */ 146 private static final int PARSE_AUTHORITY_EXTRA = 4; 147 148 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"; 149 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX); 150 151 private static final String QUERY_REGEX = "^(\\S*)$"; 152 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX); 153 154 /** 155 * If no schemes are provided, default to this set. 156 */ 157 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case 158 159 /** 160 * Singleton instance of this class with default schemes and options. 161 */ 162 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator(); 163 164 /** 165 * Returns the singleton instance of this class with default schemes and options. 166 * @return singleton instance with default schemes and options 167 */ 168 public static UrlValidator getInstance() { 169 return DEFAULT_URL_VALIDATOR; 170 } 171 172 /** 173 * Tests whether the given flag is on. If the flag is not a power of 2 174 * (for example, 3) this tests whether the combination of flags is on. 175 * 176 * @param flag Flag value to check. 177 * @param options what to check 178 * @return whether the specified flag value is on. 179 */ 180 private static boolean isOn(final long flag, final long options) { 181 return (options & flag) > 0; 182 } 183 184 /** 185 * Holds the set of current validation options. 186 */ 187 private final long options; 188 189 /** 190 * The set of schemes that are allowed to be in a URL. 191 */ 192 private final Set<String> allowedSchemes; // Must be lower-case 193 194 /** 195 * Regular expressions used to manually validate authorities if IANA 196 * domain name validation isn't desired. 197 */ 198 private final RegexValidator authorityValidator; 199 200 /** 201 * The domain validator. 202 */ 203 private final DomainValidator domainValidator; 204 205 /** 206 * Constructs a new instance with default properties. 207 */ 208 public UrlValidator() { 209 this(null); 210 } 211 212 /** 213 * Constructs a new instance with the given validation options. 214 * @param options The options should be set using the public constants declared in 215 * this class. To set multiple options you simply add them together. For example, 216 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 217 */ 218 public UrlValidator(final long options) { 219 this(null, null, options); 220 } 221 222 /** 223 * Constructs a new instance with the given validation options. 224 * @param authorityValidator Regular expression validator used to validate the authority part 225 * This allows the user to override the standard set of domains. 226 * @param options Validation options. Set using the public constants of this class. 227 * To set multiple options, simply add them together: 228 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p> 229 * enables both of those options. 230 */ 231 public UrlValidator(final RegexValidator authorityValidator, final long options) { 232 this(null, authorityValidator, options); 233 } 234 235 /** 236 * Behavior of validation is modified by passing in several strings options: 237 * @param schemes Pass in one or more URL schemes to consider valid, passing in 238 * a null will default to "http,https,ftp" being valid. 239 * If a non-null schemes is specified then all valid schemes must 240 * be specified. Setting the ALLOW_ALL_SCHEMES option will 241 * ignore the contents of schemes. 242 */ 243 public UrlValidator(final String[] schemes) { 244 this(schemes, 0L); 245 } 246 247 /** 248 * Behavior of validation is modified by passing in options: 249 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 250 * @param options The options should be set using the public constants declared in 251 * this class. To set multiple options you simply add them together. For example, 252 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 253 */ 254 public UrlValidator(final String[] schemes, final long options) { 255 this(schemes, null, options); 256 } 257 258 /** 259 * Customizable constructor. Validation behavior is modified by passing in options. 260 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 261 * @param authorityValidator Regular expression validator used to validate the authority part 262 * @param options Validation options. Set using the public constants of this class. 263 * To set multiple options, simply add them together: 264 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p> 265 * enables both of those options. 266 */ 267 public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) { 268 this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options))); 269 } 270 271 /** 272 * Customizable constructor. Validation behavior is modified by passing in options. 273 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 274 * @param authorityValidator Regular expression validator used to validate the authority part 275 * @param options Validation options. Set using the public constants of this class. 276 * To set multiple options, simply add them together: 277 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p> 278 * enables both of those options. 279 * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting 280 * @since 1.7 281 */ 282 public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) { 283 this.options = options; 284 if (domainValidator == null) { 285 throw new IllegalArgumentException("DomainValidator must not be null"); 286 } 287 if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) { 288 throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting"); 289 } 290 this.domainValidator = domainValidator; 291 292 if (isOn(ALLOW_ALL_SCHEMES)) { 293 allowedSchemes = Collections.emptySet(); 294 } else { 295 if (schemes == null) { 296 schemes = DEFAULT_SCHEMES; 297 } 298 allowedSchemes = new HashSet<>(schemes.length); 299 for (final String scheme : schemes) { 300 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH)); 301 } 302 } 303 304 this.authorityValidator = authorityValidator; 305 } 306 307 /** 308 * Returns the number of times the token appears in the target. 309 * @param token Token value to be counted. 310 * @param target Target value to count tokens in. 311 * @return the number of tokens. 312 */ 313 protected int countToken(final String token, final String target) { 314 int tokenIndex = 0; 315 int count = 0; 316 while (tokenIndex != -1) { 317 tokenIndex = target.indexOf(token, tokenIndex); 318 if (tokenIndex > -1) { 319 tokenIndex++; 320 count++; 321 } 322 } 323 return count; 324 } 325 326 /** 327 * Tests whether the given flag is off. If the flag is not a power of 2 328 * (for example, 3) this tests whether the combination of flags is off. 329 * 330 * @param flag Flag value to check. 331 * @return whether the specified flag value is off. 332 */ 333 private boolean isOff(final long flag) { 334 return (options & flag) == 0; 335 } 336 337 /** 338 * Tests whether the given flag is on. If the flag is not a power of 2 339 * (for example, 3) this tests whether the combination of flags is on. 340 * 341 * @param flag Flag value to check. 342 * @return whether the specified flag value is on. 343 */ 344 private boolean isOn(final long flag) { 345 return (options & flag) > 0; 346 } 347 348 /** 349 * <p>Checks if a field has a valid URL address.</p> 350 * 351 * Note that the method calls #isValidAuthority() 352 * which checks that the domain is valid. 353 * 354 * @param value The value validation is being performed on. A {@code null} 355 * value is considered invalid. 356 * @return true if the URL is valid. 357 */ 358 public boolean isValid(final String value) { 359 if (value == null) { 360 return false; 361 } 362 final URI uri; // ensure value is a valid URI 363 try { 364 uri = new URI(value); 365 } catch (final URISyntaxException e) { 366 return false; 367 } 368 // OK, perform additional validation 369 final String scheme = uri.getScheme(); 370 if (!isValidScheme(scheme)) { 371 return false; 372 } 373 final String authority = uri.getRawAuthority(); 374 if ("file".equals(scheme) && GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority 375 return true; // this is a local file - nothing more to do here 376 } 377 // Validate the authority 378 if ("file".equals(scheme) && authority != null && authority.contains(":") || !isValidAuthority(authority)) { 379 return false; 380 } 381 if (!isValidPath(uri.getRawPath()) || !isValidQuery(uri.getRawQuery()) || !isValidFragment(uri.getRawFragment())) { 382 return false; 383 } 384 return true; 385 } 386 387 /** 388 * Returns true if the authority is properly formatted. An authority is the combination 389 * of hostname and port. A {@code null} authority value is considered invalid. 390 * Note: this implementation validates the domain unless a RegexValidator was provided. 391 * If a RegexValidator was supplied, and it matches, then the authority is regarded 392 * as valid with no further checks, otherwise the method checks against the 393 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS) 394 * @param authority Authority value to validate, allows IDN 395 * @return true if authority (hostname and port) is valid. 396 */ 397 protected boolean isValidAuthority(final String authority) { 398 if (authority == null) { 399 return false; 400 } 401 402 // check manual authority validation if specified 403 if (authorityValidator != null && authorityValidator.isValid(authority)) { 404 return true; 405 } 406 // convert to ASCII if possible 407 final String authorityASCII = DomainValidator.unicodeToASCII(authority); 408 409 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII); 410 if (!authorityMatcher.matches()) { 411 return false; 412 } 413 414 // We have to process IPV6 separately because that is parsed in a different group 415 final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6); 416 if (ipv6 != null) { 417 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 418 if (!inetAddressValidator.isValidInet6Address(ipv6)) { 419 return false; 420 } 421 } else { 422 final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 423 // check if authority is hostname or IP address: 424 // try a hostname first since that's much more likely 425 if (!domainValidator.isValid(hostLocation)) { 426 // try an IPv4 address 427 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 428 if (!inetAddressValidator.isValidInet4Address(hostLocation)) { 429 // isn't IPv4, so the URL is invalid 430 return false; 431 } 432 } 433 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); 434 if (!GenericValidator.isBlankOrNull(port)) { 435 try { 436 final int iPort = Integer.parseInt(port); 437 if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) { 438 return false; 439 } 440 } catch (final NumberFormatException nfe) { 441 return false; // this can happen for big numbers 442 } 443 } 444 } 445 446 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); 447 if (extra != null && !extra.trim().isEmpty()) { 448 return false; 449 } 450 451 return true; 452 } 453 454 /** 455 * Returns true if the given fragment is null or fragments are allowed. 456 * @param fragment Fragment value to validate. 457 * @return true if fragment is valid. 458 */ 459 protected boolean isValidFragment(final String fragment) { 460 if (fragment == null) { 461 return true; 462 } 463 464 return isOff(NO_FRAGMENTS); 465 } 466 467 /** 468 * Returns true if the path is valid. A {@code null} value is considered invalid. 469 * @param path Path value to validate. 470 * @return true if path is valid. 471 */ 472 protected boolean isValidPath(final String path) { 473 if (path == null || !PATH_PATTERN.matcher(path).matches()) { 474 return false; 475 } 476 477 try { 478 // Don't omit host otherwise leading path may be taken as host if it starts with // 479 final URI uri = new URI(null, "localhost", path, null); 480 final String norm = uri.normalize().getPath(); 481 if (norm.startsWith("/../") // Trying to go via the parent dir 482 || norm.equals("/..")) { // Trying to go to the parent dir 483 return false; 484 } 485 } catch (final URISyntaxException e) { 486 return false; 487 } 488 489 final int slash2Count = countToken("//", path); 490 if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) { 491 return false; 492 } 493 494 return true; 495 } 496 497 /** 498 * Returns true if the query is null, or it's a properly formatted query string. 499 * @param query Query value to validate. 500 * @return true if query is valid. 501 */ 502 protected boolean isValidQuery(final String query) { 503 if (query == null) { 504 return true; 505 } 506 return QUERY_PATTERN.matcher(query).matches(); 507 } 508 509 /** 510 * Validate scheme. If schemes[] was initialized to a non-null, 511 * then only those schemes are allowed. 512 * Otherwise, the default schemes are "http", "https", "ftp". 513 * Matching is case-blind. 514 * @param scheme The scheme to validate. A {@code null} value is considered 515 * invalid. 516 * @return true if valid. 517 */ 518 protected boolean isValidScheme(final String scheme) { 519 if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches() 520 || isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) { 521 return false; 522 } 523 524 return true; 525 } 526 527 }