View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.validator.routines;
18  
19  import java.io.Serializable;
20  import java.net.URI;
21  import java.net.URISyntaxException;
22  import java.util.Collections;
23  import java.util.HashSet;
24  import java.util.Locale;
25  import java.util.Set;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  /**
30   * <p><b>URL Validation</b> routines.</p>
31   * Behavior of validation is modified by passing in options:
32   * <ul>
33   * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
34   * component.</li>
35   * <li>NO_FRAGMENT- [FALSE]  By default fragments are allowed, if this option is
36   * included then fragments are flagged as illegal.</li>
37   * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
38   * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
39   * </ul>
40   *
41   * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
42   * http://javascript.internet.com. However, this validation now bears little resemblance
43   * to the php original.</p>
44   * <pre>
45   *   Example of usage:
46   *   Construct a UrlValidator with valid schemes of "http", and "https".
47   *
48   *    String[] schemes = {"http","https"}.
49   *    UrlValidator urlValidator = new UrlValidator(schemes);
50   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
51   *       System.out.println("url is valid");
52   *    } else {
53   *       System.out.println("url is invalid");
54   *    }
55   *
56   *    prints "url is invalid"
57   *   If instead the default constructor is used.
58   *
59   *    UrlValidator urlValidator = new UrlValidator();
60   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
61   *       System.out.println("url is valid");
62   *    } else {
63   *       System.out.println("url is invalid");
64   *    }
65   *
66   *   prints out "url is valid"
67   *  </pre>
68   *
69   * @see
70   * <a href="http://www.ietf.org/rfc/rfc2396.txt">
71   *  Uniform Resource Identifiers (URI): Generic Syntax
72   * </a>
73   *
74   * @version $Revision: 1783203 $
75   * @since Validator 1.4
76   */
77  public class UrlValidator implements Serializable {
78  
79      private static final long serialVersionUID = 7557161713937335013L;
80  
81      private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max
82  
83      /**
84       * Allows all validly formatted schemes to pass validation instead of
85       * supplying a set of valid schemes.
86       */
87      public static final long ALLOW_ALL_SCHEMES = 1 << 0;
88  
89      /**
90       * Allow two slashes in the path component of the URL.
91       */
92      public static final long ALLOW_2_SLASHES = 1 << 1;
93  
94      /**
95       * Enabling this options disallows any URL fragments.
96       */
97      public static final long NO_FRAGMENTS = 1 << 2;
98  
99      /**
100      * Allow local URLs, such as http://localhost/ or http://machine/ .
101      * This enables a broad-brush check, for complex local machine name
102      *  validation requirements you should create your validator with
103      *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
104      */
105     public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber
106 
107     /**
108      * This expression derived/taken from the BNF for URI (RFC2396).
109      */
110     private static final String URL_REGEX =
111             "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
112     //        12            3  4          5       6   7        8 9
113     private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
114 
115     /**
116      * Schema/Protocol (ie. http:, ftp:, file:, etc).
117      */
118     private static final int PARSE_URL_SCHEME = 2;
119 
120     /**
121      * Includes hostname/ip and port number.
122      */
123     private static final int PARSE_URL_AUTHORITY = 4;
124 
125     private static final int PARSE_URL_PATH = 5;
126 
127     private static final int PARSE_URL_QUERY = 7;
128 
129     private static final int PARSE_URL_FRAGMENT = 9;
130 
131     /**
132      * Protocol scheme (e.g. http, ftp, https).
133      */
134     private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
135     private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
136 
137     // Drop numeric, and  "+-." for now
138     // TODO does not allow for optional userinfo. 
139     // Validation of character set is done by isValidAuthority
140     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
141     private static final String IPV6_REGEX = "[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
142 
143     // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
144     // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
145     // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
146     // We assume that password has the same valid chars as user info
147     private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
148     // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
149     private static final String USERINFO_FIELD_REGEX =
150             USERINFO_CHARS_REGEX + "+" + // At least one character for the name
151             "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent
152     private static final String AUTHORITY_REGEX =
153             "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
154     //             1                          e.g. user:pass@          2                                         3       4
155     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
156 
157     private static final int PARSE_AUTHORITY_IPV6 = 1;
158 
159     private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
160 
161     private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon
162 
163     /**
164      * Should always be empty. The code currently allows spaces.
165      */
166     private static final int PARSE_AUTHORITY_EXTRA = 4;
167 
168     private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
169     private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
170 
171     private static final String QUERY_REGEX = "^(\\S*)$";
172     private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
173 
174     /**
175      * Holds the set of current validation options.
176      */
177     private final long options;
178 
179     /**
180      * The set of schemes that are allowed to be in a URL.
181      */
182     private final Set<String> allowedSchemes; // Must be lower-case
183 
184     /**
185      * Regular expressions used to manually validate authorities if IANA
186      * domain name validation isn't desired.
187      */
188     private final RegexValidator authorityValidator;
189 
190     /**
191      * If no schemes are provided, default to this set.
192      */
193     private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
194 
195     /**
196      * Singleton instance of this class with default schemes and options.
197      */
198     private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
199 
200     /**
201      * Returns the singleton instance of this class with default schemes and options.
202      * @return singleton instance with default schemes and options
203      */
204     public static UrlValidator getInstance() {
205         return DEFAULT_URL_VALIDATOR;
206     }
207 
208     /**
209      * Create a UrlValidator with default properties.
210      */
211     public UrlValidator() {
212         this(null);
213     }
214 
215     /**
216      * Behavior of validation is modified by passing in several strings options:
217      * @param schemes Pass in one or more url schemes to consider valid, passing in
218      *        a null will default to "http,https,ftp" being valid.
219      *        If a non-null schemes is specified then all valid schemes must
220      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
221      *        ignore the contents of schemes.
222      */
223     public UrlValidator(String[] schemes) {
224         this(schemes, 0L);
225     }
226 
227     /**
228      * Initialize a UrlValidator with the given validation options.
229      * @param options The options should be set using the public constants declared in
230      * this class.  To set multiple options you simply add them together.  For example,
231      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
232      */
233     public UrlValidator(long options) {
234         this(null, null, options);
235     }
236 
237     /**
238      * Behavior of validation is modified by passing in options:
239      * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
240      * @param options The options should be set using the public constants declared in
241      * this class.  To set multiple options you simply add them together.  For example,
242      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
243      */
244     public UrlValidator(String[] schemes, long options) {
245         this(schemes, null, options);
246     }
247 
248     /**
249      * Initialize a UrlValidator with the given validation options.
250      * @param authorityValidator Regular expression validator used to validate the authority part
251      * This allows the user to override the standard set of domains.
252      * @param options Validation options. Set using the public constants of this class.
253      * To set multiple options, simply add them together:
254      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
255      * enables both of those options.
256      */
257     public UrlValidator(RegexValidator authorityValidator, long options) {
258         this(null, authorityValidator, options);
259     }
260 
261     /**
262      * Customizable constructor. Validation behavior is modifed by passing in options.
263      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
264      * @param authorityValidator Regular expression validator used to validate the authority part
265      * @param options Validation options. Set using the public constants of this class.
266      * To set multiple options, simply add them together:
267      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
268      * enables both of those options.
269      */
270     public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
271         this.options = options;
272 
273         if (isOn(ALLOW_ALL_SCHEMES)) {
274             allowedSchemes = Collections.emptySet();
275         } else {
276             if (schemes == null) {
277                 schemes = DEFAULT_SCHEMES;
278             }
279             allowedSchemes = new HashSet<String>(schemes.length);
280             for(int i=0; i < schemes.length; i++) {
281                 allowedSchemes.add(schemes[i].toLowerCase(Locale.ENGLISH));
282             }
283         }
284 
285         this.authorityValidator = authorityValidator;
286     }
287 
288     /**
289      * <p>Checks if a field has a valid url address.</p>
290      *
291      * Note that the method calls #isValidAuthority()
292      * which checks that the domain is valid.
293      *
294      * @param value The value validation is being performed on.  A <code>null</code>
295      * value is considered invalid.
296      * @return true if the url is valid.
297      */
298     public boolean isValid(String value) {
299         if (value == null) {
300             return false;
301         }
302 
303         // Check the whole url address structure
304         Matcher urlMatcher = URL_PATTERN.matcher(value);
305         if (!urlMatcher.matches()) {
306             return false;
307         }
308 
309         String scheme = urlMatcher.group(PARSE_URL_SCHEME);
310         if (!isValidScheme(scheme)) {
311             return false;
312         }
313 
314         String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
315         if ("file".equals(scheme)) {// Special case - file: allows an empty authority
316             if (authority != null) {
317                 if (authority.contains(":")) { // but cannot allow trailing :
318                     return false;
319                 }
320             }
321             // drop through to continue validation
322         } else { // not file:
323             // Validate the authority
324             if (!isValidAuthority(authority)) {
325                 return false;
326             }
327         }
328 
329         if (!isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
330             return false;
331         }
332 
333         if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
334             return false;
335         }
336 
337         if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
338             return false;
339         }
340 
341         return true;
342     }
343 
344     /**
345      * Validate scheme. If schemes[] was initialized to a non null,
346      * then only those schemes are allowed.
347      * Otherwise the default schemes are "http", "https", "ftp".
348      * Matching is case-blind.
349      * @param scheme The scheme to validate.  A <code>null</code> value is considered
350      * invalid.
351      * @return true if valid.
352      */
353     protected boolean isValidScheme(String scheme) {
354         if (scheme == null) {
355             return false;
356         }
357 
358         // TODO could be removed if external schemes were checked in the ctor before being stored
359         if (!SCHEME_PATTERN.matcher(scheme).matches()) {
360             return false;
361         }
362 
363         if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
364             return false;
365         }
366 
367         return true;
368     }
369 
370     /**
371      * Returns true if the authority is properly formatted.  An authority is the combination
372      * of hostname and port.  A <code>null</code> authority value is considered invalid.
373      * Note: this implementation validates the domain unless a RegexValidator was provided.
374      * If a RegexValidator was supplied and it matches, then the authority is regarded
375      * as valid with no further checks, otherwise the method checks against the
376      * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
377      * @param authority Authority value to validate, alllows IDN
378      * @return true if authority (hostname and port) is valid.
379      */
380     protected boolean isValidAuthority(String authority) {
381         if (authority == null) {
382             return false;
383         }
384 
385         // check manual authority validation if specified
386         if (authorityValidator != null && authorityValidator.isValid(authority)) {
387             return true;
388         }
389         // convert to ASCII if possible
390         final String authorityASCII = DomainValidator.unicodeToASCII(authority);
391 
392         Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
393         if (!authorityMatcher.matches()) {
394             return false;
395         }
396 
397         // We have to process IPV6 separately because that is parsed in a different group
398         String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
399         if (ipv6 != null) {
400             InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
401                 if (!inetAddressValidator.isValidInet6Address(ipv6)) {
402                     return false;
403                 }
404         } else {
405             String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
406             // check if authority is hostname or IP address:
407             // try a hostname first since that's much more likely
408             DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
409             if (!domainValidator.isValid(hostLocation)) {
410                 // try an IPv4 address
411                 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
412                 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
413                     // isn't IPv4, so the URL is invalid
414                     return false;
415                 }
416             }
417             String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
418             if (port != null && port.length() > 0) {
419                 try {
420                     int iPort = Integer.parseInt(port);
421                     if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) {
422                         return false;
423                     }
424                 } catch (NumberFormatException nfe) {
425                     return false; // this can happen for big numbers
426                 }
427             }
428         }
429 
430         String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
431         if (extra != null && extra.trim().length() > 0){
432             return false;
433         }
434 
435         return true;
436     }
437 
438     /**
439      * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
440      * @param path Path value to validate.
441      * @return true if path is valid.
442      */
443     protected boolean isValidPath(String path) {
444         if (path == null) {
445             return false;
446         }
447 
448         if (!PATH_PATTERN.matcher(path).matches()) {
449             return false;
450         }
451 
452         try {
453             URI uri = new URI(null,null,path,null);
454             String norm = uri.normalize().getPath();
455             if (norm.startsWith("/../") // Trying to go via the parent dir 
456              || norm.equals("/..")) {   // Trying to go to the parent dir
457                 return false;
458             }
459         } catch (URISyntaxException e) {
460             return false;
461         }
462         
463         int slash2Count = countToken("//", path);
464         if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
465             return false;
466         }
467 
468         return true;
469     }
470 
471     /**
472      * Returns true if the query is null or it's a properly formatted query string.
473      * @param query Query value to validate.
474      * @return true if query is valid.
475      */
476     protected boolean isValidQuery(String query) {
477         if (query == null) {
478             return true;
479         }
480 
481         return QUERY_PATTERN.matcher(query).matches();
482     }
483 
484     /**
485      * Returns true if the given fragment is null or fragments are allowed.
486      * @param fragment Fragment value to validate.
487      * @return true if fragment is valid.
488      */
489     protected boolean isValidFragment(String fragment) {
490         if (fragment == null) {
491             return true;
492         }
493 
494         return isOff(NO_FRAGMENTS);
495     }
496 
497     /**
498      * Returns the number of times the token appears in the target.
499      * @param token Token value to be counted.
500      * @param target Target value to count tokens in.
501      * @return the number of tokens.
502      */
503     protected int countToken(String token, String target) {
504         int tokenIndex = 0;
505         int count = 0;
506         while (tokenIndex != -1) {
507             tokenIndex = target.indexOf(token, tokenIndex);
508             if (tokenIndex > -1) {
509                 tokenIndex++;
510                 count++;
511             }
512         }
513         return count;
514     }
515 
516     /**
517      * Tests whether the given flag is on.  If the flag is not a power of 2
518      * (ie. 3) this tests whether the combination of flags is on.
519      *
520      * @param flag Flag value to check.
521      *
522      * @return whether the specified flag value is on.
523      */
524     private boolean isOn(long flag) {
525         return (options & flag) > 0;
526     }
527 
528     /**
529      * Tests whether the given flag is off.  If the flag is not a power of 2
530      * (ie. 3) this tests whether the combination of flags is off.
531      *
532      * @param flag Flag value to check.
533      *
534      * @return whether the specified flag value is off.
535      */
536     private boolean isOff(long flag) {
537         return (options & flag) == 0;
538     }
539 
540     // Unit test access to pattern matcher
541     Matcher matchURL(String value) {
542         return URL_PATTERN.matcher(value);
543     }
544 }