1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.validator.routines;
18
19 import java.io.Serializable;
20 import java.net.URI;
21 import java.net.URISyntaxException;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Locale;
25 import java.util.Set;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.commons.validator.GenericValidator;
30
31 /**
32 * <p><strong>URL Validation</strong> routines.</p>
33 * Behavior of validation is modified by passing in options:
34 * <ul>
35 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
36 * component.</li>
37 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
38 * included then fragments are flagged as illegal.</li>
39 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
40 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
41 * </ul>
42 *
43 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
44 * https://javascript.internet.com. However, this validation now bears little resemblance
45 * to the php original.</p>
46 * <pre>
47 * Example of usage:
48 * Construct a UrlValidator with valid schemes of "http", and "https".
49 *
50 * String[] schemes = {"http","https"}.
51 * UrlValidator urlValidator = new UrlValidator(schemes);
52 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
53 * System.out.println("URL is valid");
54 * } else {
55 * System.out.println("URL is invalid");
56 * }
57 *
58 * prints "URL is invalid"
59 * If instead the default constructor is used.
60 *
61 * UrlValidator urlValidator = new UrlValidator();
62 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
63 * System.out.println("URL is valid");
64 * } else {
65 * System.out.println("URL is invalid");
66 * }
67 *
68 * prints out "URL is valid"
69 * </pre>
70 *
71 * @see
72 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
73 * Uniform Resource Identifiers (URI): Generic Syntax
74 * </a>
75 *
76 * @since 1.4
77 */
78 public class UrlValidator implements Serializable {
79
80 private static final long serialVersionUID = 7557161713937335013L;
81
82 private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max
83
84 /**
85 * Allows all validly formatted schemes to pass validation instead of
86 * supplying a set of valid schemes.
87 */
88 public static final long ALLOW_ALL_SCHEMES = 1 << 0;
89
90 /**
91 * Allow two slashes in the path component of the URL.
92 */
93 public static final long ALLOW_2_SLASHES = 1 << 1;
94
95 /**
96 * Enabling this options disallows any URL fragments.
97 */
98 public static final long NO_FRAGMENTS = 1 << 2;
99
100 /**
101 * Allow local URLs, such as https://localhost/ or https://machine/ .
102 * This enables a broad-brush check, for complex local machine name
103 * validation requirements you should create your validator with
104 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
105 */
106 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber
107
108 /**
109 * Protocol scheme (for example, http, ftp, https).
110 */
111 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
112 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
113
114 // Drop numeric, and "+-." for now
115 // TODO does not allow for optional userinfo.
116 // Validation of character set is done by isValidAuthority
117 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
118 // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
119 private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
120
121 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
122 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
123 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
124 // We assume that password has the same valid chars as user info
125 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
126
127 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
128 private static final String USERINFO_FIELD_REGEX =
129 USERINFO_CHARS_REGEX + "+" + // At least one character for the name
130 "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent
131
132 private static final String AUTHORITY_REGEX =
133 "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
134 // 1 for example, user:pass@ 2 3 4
135 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
136
137 private static final int PARSE_AUTHORITY_IPV6 = 1;
138
139 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
140
141 private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon
142
143 /**
144 * Should always be empty. The code currently allows spaces.
145 */
146 private static final int PARSE_AUTHORITY_EXTRA = 4;
147
148 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
149 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
150
151 private static final String QUERY_REGEX = "^(\\S*)$";
152 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
153
154 /**
155 * If no schemes are provided, default to this set.
156 */
157 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
158
159 /**
160 * Singleton instance of this class with default schemes and options.
161 */
162 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
163
164 /**
165 * Returns the singleton instance of this class with default schemes and options.
166 * @return singleton instance with default schemes and options
167 */
168 public static UrlValidator getInstance() {
169 return DEFAULT_URL_VALIDATOR;
170 }
171
172 /**
173 * Tests whether the given flag is on. If the flag is not a power of 2
174 * (for example, 3) this tests whether the combination of flags is on.
175 *
176 * @param flag Flag value to check.
177 * @param options what to check
178 * @return whether the specified flag value is on.
179 */
180 private static boolean isOn(final long flag, final long options) {
181 return (options & flag) > 0;
182 }
183
184 /**
185 * Holds the set of current validation options.
186 */
187 private final long options;
188
189 /**
190 * The set of schemes that are allowed to be in a URL.
191 */
192 private final Set<String> allowedSchemes; // Must be lower-case
193
194 /**
195 * Regular expressions used to manually validate authorities if IANA
196 * domain name validation isn't desired.
197 */
198 private final RegexValidator authorityValidator;
199
200 /**
201 * The domain validator.
202 */
203 private final DomainValidator domainValidator;
204
205 /**
206 * Constructs a new instance with default properties.
207 */
208 public UrlValidator() {
209 this(null);
210 }
211
212 /**
213 * Constructs a new instance with the given validation options.
214 * @param options The options should be set using the public constants declared in
215 * this class. To set multiple options you simply add them together. For example,
216 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
217 */
218 public UrlValidator(final long options) {
219 this(null, null, options);
220 }
221
222 /**
223 * Constructs a new instance with the given validation options.
224 * @param authorityValidator Regular expression validator used to validate the authority part
225 * This allows the user to override the standard set of domains.
226 * @param options Validation options. Set using the public constants of this class.
227 * To set multiple options, simply add them together:
228 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
229 * enables both of those options.
230 */
231 public UrlValidator(final RegexValidator authorityValidator, final long options) {
232 this(null, authorityValidator, options);
233 }
234
235 /**
236 * Behavior of validation is modified by passing in several strings options:
237 * @param schemes Pass in one or more URL schemes to consider valid, passing in
238 * a null will default to "http,https,ftp" being valid.
239 * If a non-null schemes is specified then all valid schemes must
240 * be specified. Setting the ALLOW_ALL_SCHEMES option will
241 * ignore the contents of schemes.
242 */
243 public UrlValidator(final String[] schemes) {
244 this(schemes, 0L);
245 }
246
247 /**
248 * Behavior of validation is modified by passing in options:
249 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
250 * @param options The options should be set using the public constants declared in
251 * this class. To set multiple options you simply add them together. For example,
252 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
253 */
254 public UrlValidator(final String[] schemes, final long options) {
255 this(schemes, null, options);
256 }
257
258 /**
259 * Customizable constructor. Validation behavior is modified by passing in options.
260 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
261 * @param authorityValidator Regular expression validator used to validate the authority part
262 * @param options Validation options. Set using the public constants of this class.
263 * To set multiple options, simply add them together:
264 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
265 * enables both of those options.
266 */
267 public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
268 this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
269 }
270
271 /**
272 * Customizable constructor. Validation behavior is modified by passing in options.
273 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
274 * @param authorityValidator Regular expression validator used to validate the authority part
275 * @param options Validation options. Set using the public constants of this class.
276 * To set multiple options, simply add them together:
277 * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
278 * enables both of those options.
279 * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
280 * @since 1.7
281 */
282 public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) {
283 this.options = options;
284 if (domainValidator == null) {
285 throw new IllegalArgumentException("DomainValidator must not be null");
286 }
287 if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) {
288 throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
289 }
290 this.domainValidator = domainValidator;
291
292 if (isOn(ALLOW_ALL_SCHEMES)) {
293 allowedSchemes = Collections.emptySet();
294 } else {
295 if (schemes == null) {
296 schemes = DEFAULT_SCHEMES;
297 }
298 allowedSchemes = new HashSet<>(schemes.length);
299 for (final String scheme : schemes) {
300 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
301 }
302 }
303
304 this.authorityValidator = authorityValidator;
305 }
306
307 /**
308 * Returns the number of times the token appears in the target.
309 * @param token Token value to be counted.
310 * @param target Target value to count tokens in.
311 * @return the number of tokens.
312 */
313 protected int countToken(final String token, final String target) {
314 int tokenIndex = 0;
315 int count = 0;
316 while (tokenIndex != -1) {
317 tokenIndex = target.indexOf(token, tokenIndex);
318 if (tokenIndex > -1) {
319 tokenIndex++;
320 count++;
321 }
322 }
323 return count;
324 }
325
326 /**
327 * Tests whether the given flag is off. If the flag is not a power of 2
328 * (for example, 3) this tests whether the combination of flags is off.
329 *
330 * @param flag Flag value to check.
331 * @return whether the specified flag value is off.
332 */
333 private boolean isOff(final long flag) {
334 return (options & flag) == 0;
335 }
336
337 /**
338 * Tests whether the given flag is on. If the flag is not a power of 2
339 * (for example, 3) this tests whether the combination of flags is on.
340 *
341 * @param flag Flag value to check.
342 * @return whether the specified flag value is on.
343 */
344 private boolean isOn(final long flag) {
345 return (options & flag) > 0;
346 }
347
348 /**
349 * <p>Checks if a field has a valid URL address.</p>
350 *
351 * Note that the method calls #isValidAuthority()
352 * which checks that the domain is valid.
353 *
354 * @param value The value validation is being performed on. A {@code null}
355 * value is considered invalid.
356 * @return true if the URL is valid.
357 */
358 public boolean isValid(final String value) {
359 if (value == null) {
360 return false;
361 }
362 final URI uri; // ensure value is a valid URI
363 try {
364 uri = new URI(value);
365 } catch (final URISyntaxException e) {
366 return false;
367 }
368 // OK, perform additional validation
369 final String scheme = uri.getScheme();
370 if (!isValidScheme(scheme)) {
371 return false;
372 }
373 final String authority = uri.getRawAuthority();
374 if ("file".equals(scheme) && GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority
375 return true; // this is a local file - nothing more to do here
376 }
377 // Validate the authority
378 if ("file".equals(scheme) && authority != null && authority.contains(":") || !isValidAuthority(authority)) {
379 return false;
380 }
381 if (!isValidPath(uri.getRawPath()) || !isValidQuery(uri.getRawQuery()) || !isValidFragment(uri.getRawFragment())) {
382 return false;
383 }
384 return true;
385 }
386
387 /**
388 * Returns true if the authority is properly formatted. An authority is the combination
389 * of hostname and port. A {@code null} authority value is considered invalid.
390 * Note: this implementation validates the domain unless a RegexValidator was provided.
391 * If a RegexValidator was supplied, and it matches, then the authority is regarded
392 * as valid with no further checks, otherwise the method checks against the
393 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
394 * @param authority Authority value to validate, allows IDN
395 * @return true if authority (hostname and port) is valid.
396 */
397 protected boolean isValidAuthority(final String authority) {
398 if (authority == null) {
399 return false;
400 }
401
402 // check manual authority validation if specified
403 if (authorityValidator != null && authorityValidator.isValid(authority)) {
404 return true;
405 }
406 // convert to ASCII if possible
407 final String authorityASCII = DomainValidator.unicodeToASCII(authority);
408
409 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
410 if (!authorityMatcher.matches()) {
411 return false;
412 }
413
414 // We have to process IPV6 separately because that is parsed in a different group
415 final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
416 if (ipv6 != null) {
417 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
418 if (!inetAddressValidator.isValidInet6Address(ipv6)) {
419 return false;
420 }
421 } else {
422 final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
423 // check if authority is hostname or IP address:
424 // try a hostname first since that's much more likely
425 if (!domainValidator.isValid(hostLocation)) {
426 // try an IPv4 address
427 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
428 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
429 // isn't IPv4, so the URL is invalid
430 return false;
431 }
432 }
433 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
434 if (!GenericValidator.isBlankOrNull(port)) {
435 try {
436 final int iPort = Integer.parseInt(port);
437 if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) {
438 return false;
439 }
440 } catch (final NumberFormatException nfe) {
441 return false; // this can happen for big numbers
442 }
443 }
444 }
445
446 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
447 if (extra != null && !extra.trim().isEmpty()) {
448 return false;
449 }
450
451 return true;
452 }
453
454 /**
455 * Returns true if the given fragment is null or fragments are allowed.
456 * @param fragment Fragment value to validate.
457 * @return true if fragment is valid.
458 */
459 protected boolean isValidFragment(final String fragment) {
460 if (fragment == null) {
461 return true;
462 }
463
464 return isOff(NO_FRAGMENTS);
465 }
466
467 /**
468 * Returns true if the path is valid. A {@code null} value is considered invalid.
469 * @param path Path value to validate.
470 * @return true if path is valid.
471 */
472 protected boolean isValidPath(final String path) {
473 if (path == null || !PATH_PATTERN.matcher(path).matches()) {
474 return false;
475 }
476
477 try {
478 // Don't omit host otherwise leading path may be taken as host if it starts with //
479 final URI uri = new URI(null, "localhost", path, null);
480 final String norm = uri.normalize().getPath();
481 if (norm.startsWith("/../") // Trying to go via the parent dir
482 || norm.equals("/..")) { // Trying to go to the parent dir
483 return false;
484 }
485 } catch (final URISyntaxException e) {
486 return false;
487 }
488
489 final int slash2Count = countToken("//", path);
490 if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
491 return false;
492 }
493
494 return true;
495 }
496
497 /**
498 * Returns true if the query is null, or it's a properly formatted query string.
499 * @param query Query value to validate.
500 * @return true if query is valid.
501 */
502 protected boolean isValidQuery(final String query) {
503 if (query == null) {
504 return true;
505 }
506 return QUERY_PATTERN.matcher(query).matches();
507 }
508
509 /**
510 * Validate scheme. If schemes[] was initialized to a non-null,
511 * then only those schemes are allowed.
512 * Otherwise, the default schemes are "http", "https", "ftp".
513 * Matching is case-blind.
514 * @param scheme The scheme to validate. A {@code null} value is considered
515 * invalid.
516 * @return true if valid.
517 */
518 protected boolean isValidScheme(final String scheme) {
519 if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches()
520 || isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
521 return false;
522 }
523
524 return true;
525 }
526
527 }