View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.validator.routines;
18  
19  import java.io.Serializable;
20  import java.net.URI;
21  import java.net.URISyntaxException;
22  import java.util.Collections;
23  import java.util.HashSet;
24  import java.util.Locale;
25  import java.util.Set;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  /**
30   * <p><b>URL Validation</b> routines.</p>
31   * Behavior of validation is modified by passing in options:
32   * <ul>
33   * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
34   * component.</li>
35   * <li>NO_FRAGMENT- [FALSE]  By default fragments are allowed, if this option is
36   * included then fragments are flagged as illegal.</li>
37   * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
38   * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
39   * </ul>
40   *
41   * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
42   * http://javascript.internet.com. However, this validation now bears little resemblance
43   * to the php original.</p>
44   * <pre>
45   *   Example of usage:
46   *   Construct a UrlValidator with valid schemes of "http", and "https".
47   *
48   *    String[] schemes = {"http","https"}.
49   *    UrlValidator urlValidator = new UrlValidator(schemes);
50   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
51   *       System.out.println("URL is valid");
52   *    } else {
53   *       System.out.println("URL is invalid");
54   *    }
55   *
56   *    prints "URL is invalid"
57   *   If instead the default constructor is used.
58   *
59   *    UrlValidator urlValidator = new UrlValidator();
60   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
61   *       System.out.println("URL is valid");
62   *    } else {
63   *       System.out.println("URL is invalid");
64   *    }
65   *
66   *   prints out "URL is valid"
67   *  </pre>
68   *
69   * @see
70   * <a href="http://www.ietf.org/rfc/rfc2396.txt">
71   *  Uniform Resource Identifiers (URI): Generic Syntax
72   * </a>
73   *
74   * @since 1.4
75   */
76  public class UrlValidator implements Serializable {
77  
78      private static final long serialVersionUID = 7557161713937335013L;
79  
80      private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max
81  
82      /**
83       * Allows all validly formatted schemes to pass validation instead of
84       * supplying a set of valid schemes.
85       */
86      public static final long ALLOW_ALL_SCHEMES = 1 << 0;
87  
88      /**
89       * Allow two slashes in the path component of the URL.
90       */
91      public static final long ALLOW_2_SLASHES = 1 << 1;
92  
93      /**
94       * Enabling this options disallows any URL fragments.
95       */
96      public static final long NO_FRAGMENTS = 1 << 2;
97  
98      /**
99       * Allow local URLs, such as http://localhost/ or http://machine/ .
100      * This enables a broad-brush check, for complex local machine name
101      *  validation requirements you should create your validator with
102      *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
103      */
104     public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber
105 
106     /**
107      * Protocol scheme (e.g. http, ftp, https).
108      */
109     private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
110     private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
111 
112     // Drop numeric, and  "+-." for now
113     // TODO does not allow for optional userinfo.
114     // Validation of character set is done by isValidAuthority
115     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
116     // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
117     private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
118 
119     // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
120     // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
121     // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
122     // We assume that password has the same valid chars as user info
123     private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
124 
125     // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
126     private static final String USERINFO_FIELD_REGEX =
127             USERINFO_CHARS_REGEX + "+" + // At least one character for the name
128             "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent
129 
130     private static final String AUTHORITY_REGEX =
131             "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
132     //             1                                 e.g. user:pass@           2                                       3       4
133     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
134 
135     private static final int PARSE_AUTHORITY_IPV6 = 1;
136 
137     private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
138 
139     private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon
140 
141     /**
142      * Should always be empty. The code currently allows spaces.
143      */
144     private static final int PARSE_AUTHORITY_EXTRA = 4;
145 
146     private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
147     private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
148 
149     private static final String QUERY_REGEX = "^(\\S*)$";
150     private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
151 
152     /**
153      * If no schemes are provided, default to this set.
154      */
155     private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
156 
157     /**
158      * Singleton instance of this class with default schemes and options.
159      */
160     private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
161 
162     /**
163      * Returns the singleton instance of this class with default schemes and options.
164      * @return singleton instance with default schemes and options
165      */
166     public static UrlValidator getInstance() {
167         return DEFAULT_URL_VALIDATOR;
168     }
169 
170     /**
171      * Tests whether the given flag is on.  If the flag is not a power of 2
172      * (e.g. 3) this tests whether the combination of flags is on.
173      *
174      * @param flag Flag value to check.
175      * @param options what to check
176      *
177      * @return whether the specified flag value is on.
178      */
179     private static boolean isOn(final long flag, final long options) {
180         return (options & flag) > 0;
181     }
182 
183     /**
184      * Holds the set of current validation options.
185      */
186     private final long options;
187 
188     /**
189      * The set of schemes that are allowed to be in a URL.
190      */
191     private final Set<String> allowedSchemes; // Must be lower-case
192 
193     /**
194      * Regular expressions used to manually validate authorities if IANA
195      * domain name validation isn't desired.
196      */
197     private final RegexValidator authorityValidator;
198 
199     private final DomainValidator domainValidator;
200 
201     /**
202      * Create a UrlValidator with default properties.
203      */
204     public UrlValidator() {
205         this(null);
206     }
207 
208     /**
209      * Initialize a UrlValidator with the given validation options.
210      * @param options The options should be set using the public constants declared in
211      * this class.  To set multiple options you simply add them together.  For example,
212      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
213      */
214     public UrlValidator(final long options) {
215         this(null, null, options);
216     }
217 
218     /**
219      * Initialize a UrlValidator with the given validation options.
220      * @param authorityValidator Regular expression validator used to validate the authority part
221      * This allows the user to override the standard set of domains.
222      * @param options Validation options. Set using the public constants of this class.
223      * To set multiple options, simply add them together:
224      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
225      * enables both of those options.
226      */
227     public UrlValidator(final RegexValidator authorityValidator, final long options) {
228         this(null, authorityValidator, options);
229     }
230 
231     /**
232      * Behavior of validation is modified by passing in several strings options:
233      * @param schemes Pass in one or more URL schemes to consider valid, passing in
234      *        a null will default to "http,https,ftp" being valid.
235      *        If a non-null schemes is specified then all valid schemes must
236      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
237      *        ignore the contents of schemes.
238      */
239     public UrlValidator(final String[] schemes) {
240         this(schemes, 0L);
241     }
242 
243     /**
244      * Behavior of validation is modified by passing in options:
245      * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
246      * @param options The options should be set using the public constants declared in
247      * this class.  To set multiple options you simply add them together.  For example,
248      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
249      */
250     public UrlValidator(final String[] schemes, final long options) {
251         this(schemes, null, options);
252     }
253 
254     /**
255      * Customizable constructor. Validation behavior is modified by passing in options.
256      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
257      * @param authorityValidator Regular expression validator used to validate the authority part
258      * @param options Validation options. Set using the public constants of this class.
259      * To set multiple options, simply add them together:
260      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
261      * enables both of those options.
262      */
263     public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
264         this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
265     }
266 
267     /**
268      * Customizable constructor. Validation behavior is modified by passing in options.
269      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
270      * @param authorityValidator Regular expression validator used to validate the authority part
271      * @param options Validation options. Set using the public constants of this class.
272      * To set multiple options, simply add them together:
273      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
274      * enables both of those options.
275      * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
276      * @since 1.7
277      */
278     public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) {
279         this.options = options;
280         if (domainValidator == null) {
281             throw new IllegalArgumentException("DomainValidator must not be null");
282         }
283         if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) {
284             throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
285         }
286         this.domainValidator = domainValidator;
287 
288         if (isOn(ALLOW_ALL_SCHEMES)) {
289             allowedSchemes = Collections.emptySet();
290         } else {
291             if (schemes == null) {
292                 schemes = DEFAULT_SCHEMES;
293             }
294             allowedSchemes = new HashSet<>(schemes.length);
295             for (final String scheme : schemes) {
296                 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
297             }
298         }
299 
300         this.authorityValidator = authorityValidator;
301     }
302 
303     /**
304      * Returns the number of times the token appears in the target.
305      * @param token Token value to be counted.
306      * @param target Target value to count tokens in.
307      * @return the number of tokens.
308      */
309     protected int countToken(final String token, final String target) {
310         int tokenIndex = 0;
311         int count = 0;
312         while (tokenIndex != -1) {
313             tokenIndex = target.indexOf(token, tokenIndex);
314             if (tokenIndex > -1) {
315                 tokenIndex++;
316                 count++;
317             }
318         }
319         return count;
320     }
321 
322     /**
323      * Tests whether the given flag is off.  If the flag is not a power of 2
324      * (ie. 3) this tests whether the combination of flags is off.
325      *
326      * @param flag Flag value to check.
327      *
328      * @return whether the specified flag value is off.
329      */
330     private boolean isOff(final long flag) {
331         return (options & flag) == 0;
332     }
333 
334     /**
335      * Tests whether the given flag is on.  If the flag is not a power of 2
336      * (ie. 3) this tests whether the combination of flags is on.
337      *
338      * @param flag Flag value to check.
339      *
340      * @return whether the specified flag value is on.
341      */
342     private boolean isOn(final long flag) {
343         return (options & flag) > 0;
344     }
345 
346     /**
347      * <p>Checks if a field has a valid URL address.</p>
348      *
349      * Note that the method calls #isValidAuthority()
350      * which checks that the domain is valid.
351      *
352      * @param value The value validation is being performed on.  A <code>null</code>
353      * value is considered invalid.
354      * @return true if the URL is valid.
355      */
356     public boolean isValid(final String value) {
357         if (value == null) {
358             return false;
359         }
360 
361         URI uri; // ensure value is a valid URI
362         try {
363             uri = new URI(value);
364         } catch (final URISyntaxException e) {
365             return false;
366         }
367         // OK, perform additional validation
368 
369         final String scheme = uri.getScheme();
370         if (!isValidScheme(scheme)) {
371             return false;
372         }
373 
374         final String authority = uri.getRawAuthority();
375         if ("file".equals(scheme) && (authority == null || authority.isEmpty())) { // Special case - file: allows an empty authority
376             return true; // this is a local file - nothing more to do here
377         }
378         if ("file".equals(scheme) && authority != null && authority.contains(":")) {
379             return false;
380         }
381         // Validate the authority
382         if (!isValidAuthority(authority)) {
383             return false;
384         }
385 
386         if (!isValidPath(uri.getRawPath())) {
387             return false;
388         }
389 
390         if (!isValidQuery(uri.getRawQuery())) {
391             return false;
392         }
393 
394         if (!isValidFragment(uri.getRawFragment())) {
395             return false;
396         }
397 
398         return true;
399     }
400 
401     /**
402      * Returns true if the authority is properly formatted.  An authority is the combination
403      * of hostname and port.  A <code>null</code> authority value is considered invalid.
404      * Note: this implementation validates the domain unless a RegexValidator was provided.
405      * If a RegexValidator was supplied and it matches, then the authority is regarded
406      * as valid with no further checks, otherwise the method checks against the
407      * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
408      * @param authority Authority value to validate, alllows IDN
409      * @return true if authority (hostname and port) is valid.
410      */
411     protected boolean isValidAuthority(final String authority) {
412         if (authority == null) {
413             return false;
414         }
415 
416         // check manual authority validation if specified
417         if (authorityValidator != null && authorityValidator.isValid(authority)) {
418             return true;
419         }
420         // convert to ASCII if possible
421         final String authorityASCII = DomainValidator.unicodeToASCII(authority);
422 
423         final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
424         if (!authorityMatcher.matches()) {
425             return false;
426         }
427 
428         // We have to process IPV6 separately because that is parsed in a different group
429         final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
430         if (ipv6 != null) {
431             final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
432             if (!inetAddressValidator.isValidInet6Address(ipv6)) {
433                 return false;
434             }
435         } else {
436             final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
437             // check if authority is hostname or IP address:
438             // try a hostname first since that's much more likely
439             if (!this.domainValidator.isValid(hostLocation)) {
440                 // try an IPv4 address
441                 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
442                 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
443                     // isn't IPv4, so the URL is invalid
444                     return false;
445                 }
446             }
447             final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
448             if (port != null && !port.isEmpty()) {
449                 try {
450                     final int iPort = Integer.parseInt(port);
451                     if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) {
452                         return false;
453                     }
454                 } catch (final NumberFormatException nfe) {
455                     return false; // this can happen for big numbers
456                 }
457             }
458         }
459 
460         final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
461         if (extra != null && !extra.trim().isEmpty()) {
462             return false;
463         }
464 
465         return true;
466     }
467 
468     /**
469      * Returns true if the given fragment is null or fragments are allowed.
470      * @param fragment Fragment value to validate.
471      * @return true if fragment is valid.
472      */
473     protected boolean isValidFragment(final String fragment) {
474         if (fragment == null) {
475             return true;
476         }
477 
478         return isOff(NO_FRAGMENTS);
479     }
480 
481     /**
482      * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
483      * @param path Path value to validate.
484      * @return true if path is valid.
485      */
486     protected boolean isValidPath(final String path) {
487         if (path == null) {
488             return false;
489         }
490 
491         if (!PATH_PATTERN.matcher(path).matches()) {
492             return false;
493         }
494 
495         try {
496             // Don't omit host otherwise leading path may be taken as host if it starts with //
497             final URI uri = new URI(null,"localhost",path,null);
498             final String norm = uri.normalize().getPath();
499             if (norm.startsWith("/../") // Trying to go via the parent dir
500              || norm.equals("/..")) {   // Trying to go to the parent dir
501                 return false;
502             }
503         } catch (final URISyntaxException e) {
504             return false;
505         }
506 
507         final int slash2Count = countToken("//", path);
508         if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
509             return false;
510         }
511 
512         return true;
513     }
514 
515     /**
516      * Returns true if the query is null or it's a properly formatted query string.
517      * @param query Query value to validate.
518      * @return true if query is valid.
519      */
520     protected boolean isValidQuery(final String query) {
521         if (query == null) {
522             return true;
523         }
524 
525         return QUERY_PATTERN.matcher(query).matches();
526     }
527 
528     /**
529      * Validate scheme. If schemes[] was initialized to a non null,
530      * then only those schemes are allowed.
531      * Otherwise the default schemes are "http", "https", "ftp".
532      * Matching is case-blind.
533      * @param scheme The scheme to validate.  A <code>null</code> value is considered
534      * invalid.
535      * @return true if valid.
536      */
537     protected boolean isValidScheme(final String scheme) {
538         if (scheme == null) {
539             return false;
540         }
541 
542         if (!SCHEME_PATTERN.matcher(scheme).matches()) {
543             return false;
544         }
545 
546         if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
547             return false;
548         }
549 
550         return true;
551     }
552 
553 }