1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.validator.routines;
18
19 import java.io.Serializable;
20 import java.net.URI;
21 import java.net.URISyntaxException;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Locale;
25 import java.util.Set;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.commons.validator.GenericValidator;
30
31 /**
32 * <p><strong>URL Validation</strong> routines.</p>
33 * Behavior of validation is modified by passing in options:
34 * <ul>
35 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
36 * component.</li>
37 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
38 * included then fragments are flagged as illegal.</li>
39 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
40 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
41 * </ul>
42 *
43 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
44 * https://javascript.internet.com. However, this validation now bears little resemblance
45 * to the php original.</p>
46 * <pre>
47 * Example of usage:
48 * Construct a UrlValidator with valid schemes of "http", and "https".
49 *
50 * String[] schemes = {"http","https"}.
51 * UrlValidator urlValidator = new UrlValidator(schemes);
52 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
53 * System.out.println("URL is valid");
54 * } else {
55 * System.out.println("URL is invalid");
56 * }
57 *
58 * prints "URL is invalid"
59 * If instead the default constructor is used.
60 *
61 * UrlValidator urlValidator = new UrlValidator();
62 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
63 * System.out.println("URL is valid");
64 * } else {
65 * System.out.println("URL is invalid");
66 * }
67 *
68 * prints out "URL is valid"
69 * </pre>
70 *
71 * @see
72 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
73 * Uniform Resource Identifiers (URI): Generic Syntax
74 * </a>
75 *
76 * @since 1.4
77 */
78 public class UrlValidator implements Serializable {
79
80 private static final long serialVersionUID = 7557161713937335013L;
81
82 private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max
83
84 /**
85 * Allows all validly formatted schemes to pass validation instead of
86 * supplying a set of valid schemes.
87 */
88 public static final long ALLOW_ALL_SCHEMES = 1 << 0;
89
90 /**
91 * Allow two slashes in the path component of the URL.
92 */
93 public static final long ALLOW_2_SLASHES = 1 << 1;
94
95 /**
96 * Enabling this options disallows any URL fragments.
97 */
98 public static final long NO_FRAGMENTS = 1 << 2;
99
100 /**
101 * Allow local URLs, such as https://localhost/ or https://machine/ .
102 * This enables a broad-brush check, for complex local machine name
103 * validation requirements you should create your validator with
104 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
105 */
106 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber
107
108 /**
109 * Protocol scheme (for example, http, ftp, https).
110 */
111 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
112 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
113
114 // Drop numeric, and "+-." for now
115 // TODO does not allow for optional userinfo.
116 // Validation of character set is done by isValidAuthority
117 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
118 // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
119 private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
120
121 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
122 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
123 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
124 // We assume that password has the same valid chars as user info
125 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
126
127 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
128 private static final String USERINFO_FIELD_REGEX =
129 USERINFO_CHARS_REGEX + "+" + // At least one character for the name
130 "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent
131
132 private static final String AUTHORITY_REGEX =
133 "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
134 // 1 for example, user:pass@ 2 3 4
135 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
136
137 private static final int PARSE_AUTHORITY_IPV6 = 1;
138
139 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
140
141 private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon
142
143 /**
144 * Should always be empty. The code currently allows spaces.
145 */
146 private static final int PARSE_AUTHORITY_EXTRA = 4;
147
148 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
149 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
150
151 private static final String QUERY_REGEX = "^(\\S*)$";
152 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
153
154 /**
155 * If no schemes are provided, default to this set.
156 */
157 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
158
159 /**
160 * Singleton instance of this class with default schemes and options.
161 */
162 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
163
164 /**
165 * Returns the singleton instance of this class with default schemes and options.
166 *
167 * @return singleton instance with default schemes and options
168 */
169 public static UrlValidator getInstance() {
170 return DEFAULT_URL_VALIDATOR;
171 }
172
173 /**
174 * Tests whether the given flag is on. If the flag is not a power of 2
175 * (for example, 3) this tests whether the combination of flags is on.
176 *
177 * @param flag Flag value to check.
178 * @param options what to check
179 * @return whether the specified flag value is on.
180 */
181 private static boolean isOn(final long flag, final long options) {
182 return (options & flag) > 0;
183 }
184
185 /**
186 * Holds the set of current validation options.
187 */
188 private final long options;
189
190 /**
191 * The set of schemes that are allowed to be in a URL.
192 */
193 private final Set<String> allowedSchemes; // Must be lower-case
194
195 /**
196 * Regular expressions used to manually validate authorities if IANA
197 * domain name validation isn't desired.
198 */
199 private final RegexValidator authorityValidator;
200
201 /**
202 * The domain validator.
203 */
204 private final DomainValidator domainValidator;
205
206 /**
207 * Constructs a new instance with default properties.
208 */
209 public UrlValidator() {
210 this(null);
211 }
212
213 /**
214 * Constructs a new instance with the given validation options.
215 *
216 * @param options The options should be set using the public constants declared in
217 * this class. To set multiple options you simply add them together. For example,
218 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
219 */
220 public UrlValidator(final long options) {
221 this(null, null, options);
222 }
223
224 /**
225 * Constructs a new instance with the given validation options.
226 *
227 * @param authorityValidator Regular expression validator used to validate the authority part
228 * This allows the user to override the standard set of domains.
229 * @param options Validation options. Set using the public constants of this class.
230 * To set multiple options, simply add them together:
231 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
232 * enables both of those options.
233 */
234 public UrlValidator(final RegexValidator authorityValidator, final long options) {
235 this(null, authorityValidator, options);
236 }
237
238 /**
239 * Behavior of validation is modified by passing in several strings options:
240 *
241 * @param schemes Pass in one or more URL schemes to consider valid, passing in
242 * a null will default to "http,https,ftp" being valid.
243 * If a non-null schemes is specified then all valid schemes must
244 * be specified. Setting the ALLOW_ALL_SCHEMES option will
245 * ignore the contents of schemes.
246 */
247 public UrlValidator(final String[] schemes) {
248 this(schemes, 0L);
249 }
250
251 /**
252 * Behavior of validation is modified by passing in options:
253 *
254 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
255 * @param options The options should be set using the public constants declared in
256 * this class. To set multiple options you simply add them together. For example,
257 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
258 */
259 public UrlValidator(final String[] schemes, final long options) {
260 this(schemes, null, options);
261 }
262
263 /**
264 * Customizable constructor. Validation behavior is modified by passing in options.
265 *
266 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
267 * @param authorityValidator Regular expression validator used to validate the authority part
268 * @param options Validation options. Set using the public constants of this class.
269 * To set multiple options, simply add them together:
270 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
271 * enables both of those options.
272 */
273 public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
274 this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
275 }
276
277 /**
278 * Customizable constructor. Validation behavior is modified by passing in options.
279 *
280 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
281 * @param authorityValidator Regular expression validator used to validate the authority part
282 * @param options Validation options. Set using the public constants of this class.
283 * To set multiple options, simply add them together:
284 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
285 * enables both of those options.
286 * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
287 * @since 1.7
288 */
289 public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) {
290 this.options = options;
291 if (domainValidator == null) {
292 throw new IllegalArgumentException("DomainValidator must not be null");
293 }
294 if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) {
295 throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
296 }
297 this.domainValidator = domainValidator;
298
299 if (isOn(ALLOW_ALL_SCHEMES)) {
300 allowedSchemes = Collections.emptySet();
301 } else {
302 if (schemes == null) {
303 schemes = DEFAULT_SCHEMES;
304 }
305 allowedSchemes = new HashSet<>(schemes.length);
306 for (final String scheme : schemes) {
307 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
308 }
309 }
310
311 this.authorityValidator = authorityValidator;
312 }
313
314 /**
315 * Returns the number of times the token appears in the target.
316 *
317 * @param token Token value to be counted.
318 * @param target Target value to count tokens in.
319 * @return the number of tokens.
320 */
321 protected int countToken(final String token, final String target) {
322 int tokenIndex = 0;
323 int count = 0;
324 while (tokenIndex != -1) {
325 tokenIndex = target.indexOf(token, tokenIndex);
326 if (tokenIndex > -1) {
327 tokenIndex++;
328 count++;
329 }
330 }
331 return count;
332 }
333
334 /**
335 * Tests whether the given flag is off. If the flag is not a power of 2
336 * (for example, 3) this tests whether the combination of flags is off.
337 *
338 * @param flag Flag value to check.
339 * @return whether the specified flag value is off.
340 */
341 private boolean isOff(final long flag) {
342 return (options & flag) == 0;
343 }
344
345 /**
346 * Tests whether the given flag is on. If the flag is not a power of 2
347 * (for example, 3) this tests whether the combination of flags is on.
348 *
349 * @param flag Flag value to check.
350 * @return whether the specified flag value is on.
351 */
352 private boolean isOn(final long flag) {
353 return (options & flag) > 0;
354 }
355
356 /**
357 * <p>Checks if a field has a valid URL address.</p>
358 *
359 * Note that the method calls #isValidAuthority()
360 * which checks that the domain is valid.
361 *
362 * @param value The value validation is being performed on. A {@code null}
363 * value is considered invalid.
364 * @return true if the URL is valid.
365 */
366 public boolean isValid(final String value) {
367 if (value == null) {
368 return false;
369 }
370 final URI uri; // ensure value is a valid URI
371 try {
372 uri = new URI(value);
373 } catch (final URISyntaxException e) {
374 return false;
375 }
376 // OK, perform additional validation
377 final String scheme = uri.getScheme();
378 if (!isValidScheme(scheme)) {
379 return false;
380 }
381 final String authority = uri.getRawAuthority();
382 if ("file".equals(scheme) && GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority
383 return true; // this is a local file - nothing more to do here
384 }
385 // Validate the authority
386 if ("file".equals(scheme) && authority != null && authority.contains(":") || !isValidAuthority(authority)) {
387 return false;
388 }
389 if (!isValidPath(uri.getRawPath()) || !isValidQuery(uri.getRawQuery()) || !isValidFragment(uri.getRawFragment())) {
390 return false;
391 }
392 return true;
393 }
394
395 /**
396 * Returns true if the authority is properly formatted. An authority is the combination
397 * of hostname and port. A {@code null} authority value is considered invalid.
398 * Note: this implementation validates the domain unless a RegexValidator was provided.
399 * If a RegexValidator was supplied, and it matches, then the authority is regarded
400 * as valid with no further checks, otherwise the method checks against the
401 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
402 *
403 * @param authority Authority value to validate, allows IDN
404 * @return true if authority (hostname and port) is valid.
405 */
406 protected boolean isValidAuthority(final String authority) {
407 if (authority == null) {
408 return false;
409 }
410
411 // check manual authority validation if specified
412 if (authorityValidator != null && authorityValidator.isValid(authority)) {
413 return true;
414 }
415 // convert to ASCII if possible
416 final String authorityASCII = DomainValidator.unicodeToASCII(authority);
417
418 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
419 if (!authorityMatcher.matches()) {
420 return false;
421 }
422
423 // We have to process IPV6 separately because that is parsed in a different group
424 final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
425 if (ipv6 != null) {
426 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
427 if (!inetAddressValidator.isValidInet6Address(ipv6)) {
428 return false;
429 }
430 } else {
431 final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
432 // check if authority is hostname or IP address:
433 // try a hostname first since that's much more likely
434 if (!domainValidator.isValid(hostLocation)) {
435 // try an IPv4 address
436 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
437 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
438 // isn't IPv4, so the URL is invalid
439 return false;
440 }
441 }
442 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
443 if (!GenericValidator.isBlankOrNull(port)) {
444 try {
445 final int iPort = Integer.parseInt(port);
446 if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) {
447 return false;
448 }
449 } catch (final NumberFormatException nfe) {
450 return false; // this can happen for big numbers
451 }
452 }
453 }
454
455 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
456 if (extra != null && !extra.trim().isEmpty()) {
457 return false;
458 }
459
460 return true;
461 }
462
463 /**
464 * Returns true if the given fragment is null or fragments are allowed.
465 *
466 * @param fragment Fragment value to validate.
467 * @return true if fragment is valid.
468 */
469 protected boolean isValidFragment(final String fragment) {
470 if (fragment == null) {
471 return true;
472 }
473
474 return isOff(NO_FRAGMENTS);
475 }
476
477 /**
478 * Returns true if the path is valid. A {@code null} value is considered invalid.
479 *
480 * @param path Path value to validate.
481 * @return true if path is valid.
482 */
483 protected boolean isValidPath(final String path) {
484 if (path == null || !PATH_PATTERN.matcher(path).matches()) {
485 return false;
486 }
487
488 try {
489 // Don't omit host otherwise leading path may be taken as host if it starts with //
490 final URI uri = new URI(null, "localhost", path, null);
491 final String norm = uri.normalize().getPath();
492 if (norm.startsWith("/../") // Trying to go via the parent dir
493 || norm.equals("/..")) { // Trying to go to the parent dir
494 return false;
495 }
496 } catch (final URISyntaxException e) {
497 return false;
498 }
499
500 final int slash2Count = countToken("//", path);
501 if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
502 return false;
503 }
504
505 return true;
506 }
507
508 /**
509 * Returns true if the query is null, or it's a properly formatted query string.
510 *
511 * @param query Query value to validate.
512 * @return true if query is valid.
513 */
514 protected boolean isValidQuery(final String query) {
515 if (query == null) {
516 return true;
517 }
518 return QUERY_PATTERN.matcher(query).matches();
519 }
520
521 /**
522 * Validate scheme. If schemes[] was initialized to a non-null,
523 * then only those schemes are allowed.
524 * Otherwise, the default schemes are "http", "https", "ftp".
525 * Matching is case-blind.
526 *
527 * @param scheme The scheme to validate. A {@code null} value is considered
528 * invalid.
529 * @return true if valid.
530 */
531 protected boolean isValidScheme(final String scheme) {
532 if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches()
533 || isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
534 return false;
535 }
536
537 return true;
538 }
539
540 }