View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.validator.routines;
18  
19  import java.io.Serializable;
20  import java.net.URI;
21  import java.net.URISyntaxException;
22  import java.util.Collections;
23  import java.util.HashSet;
24  import java.util.Locale;
25  import java.util.Set;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  /**
30   * <p><b>URL Validation</b> routines.</p>
31   * Behavior of validation is modified by passing in options:
32   * <ul>
33   * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
34   * component.</li>
35   * <li>NO_FRAGMENT- [FALSE]  By default fragments are allowed, if this option is
36   * included then fragments are flagged as illegal.</li>
37   * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
38   * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
39   * </ul>
40   *
41   * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
42   * http://javascript.internet.com. However, this validation now bears little resemblance
43   * to the php original.</p>
44   * <pre>
45   *   Example of usage:
46   *   Construct a UrlValidator with valid schemes of "http", and "https".
47   *
48   *    String[] schemes = {"http","https"}.
49   *    UrlValidator urlValidator = new UrlValidator(schemes);
50   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
51   *       System.out.println("url is valid");
52   *    } else {
53   *       System.out.println("url is invalid");
54   *    }
55   *
56   *    prints "url is invalid"
57   *   If instead the default constructor is used.
58   *
59   *    UrlValidator urlValidator = new UrlValidator();
60   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
61   *       System.out.println("url is valid");
62   *    } else {
63   *       System.out.println("url is invalid");
64   *    }
65   *
66   *   prints out "url is valid"
67   *  </pre>
68   *
69   * @see
70   * <a href="http://www.ietf.org/rfc/rfc2396.txt">
71   *  Uniform Resource Identifiers (URI): Generic Syntax
72   * </a>
73   *
74   * @version $Revision: 1713573 $
75   * @since Validator 1.4
76   */
77  public class UrlValidator implements Serializable {
78  
79      private static final long serialVersionUID = 7557161713937335013L;
80  
81      /**
82       * Allows all validly formatted schemes to pass validation instead of
83       * supplying a set of valid schemes.
84       */
85      public static final long ALLOW_ALL_SCHEMES = 1 << 0;
86  
87      /**
88       * Allow two slashes in the path component of the URL.
89       */
90      public static final long ALLOW_2_SLASHES = 1 << 1;
91  
92      /**
93       * Enabling this options disallows any URL fragments.
94       */
95      public static final long NO_FRAGMENTS = 1 << 2;
96  
97      /**
98       * Allow local URLs, such as http://localhost/ or http://machine/ .
99       * This enables a broad-brush check, for complex local machine name
100      *  validation requirements you should create your validator with
101      *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
102      */
103     public static final long ALLOW_LOCAL_URLS = 1 << 3;
104 
105     /**
106      * This expression derived/taken from the BNF for URI (RFC2396).
107      */
108     private static final String URL_REGEX =
109             "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
110     //        12            3  4          5       6   7        8 9
111     private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
112 
113     /**
114      * Schema/Protocol (ie. http:, ftp:, file:, etc).
115      */
116     private static final int PARSE_URL_SCHEME = 2;
117 
118     /**
119      * Includes hostname/ip and port number.
120      */
121     private static final int PARSE_URL_AUTHORITY = 4;
122 
123     private static final int PARSE_URL_PATH = 5;
124 
125     private static final int PARSE_URL_QUERY = 7;
126 
127     private static final int PARSE_URL_FRAGMENT = 9;
128 
129     /**
130      * Protocol scheme (e.g. http, ftp, https).
131      */
132     private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
133     private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
134 
135     // Drop numeric, and  "+-." for now
136     // TODO does not allow for optional userinfo. 
137     // Validation of character set is done by isValidAuthority
138     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
139     private static final String IPV6_REGEX = "[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
140 
141     // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
142     // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
143     // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
144     // We assume that password has the same valid chars as user info
145     private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
146     // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
147     private static final String USERINFO_FIELD_REGEX =
148             USERINFO_CHARS_REGEX + "+:" + // At least one character for the name
149             USERINFO_CHARS_REGEX + "*@"; // password may be absent
150     private static final String AUTHORITY_REGEX =
151             "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(:\\d*)?(.*)?";
152     //             1                          e.g. user:pass@          2                                   3       4
153     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
154 
155     private static final int PARSE_AUTHORITY_IPV6 = 1;
156 
157     private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
158 
159     // Not needed, because it is validated by AUTHORITY_REGEX
160 //    private static final int PARSE_AUTHORITY_PORT = 3;
161 
162     /**
163      * Should always be empty. The code currently allows spaces.
164      */
165     private static final int PARSE_AUTHORITY_EXTRA = 4;
166 
167     private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
168     private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
169 
170     private static final String QUERY_REGEX = "^(.*)$";
171     private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
172 
173     /**
174      * Holds the set of current validation options.
175      */
176     private final long options;
177 
178     /**
179      * The set of schemes that are allowed to be in a URL.
180      */
181     private final Set<String> allowedSchemes; // Must be lower-case
182 
183     /**
184      * Regular expressions used to manually validate authorities if IANA
185      * domain name validation isn't desired.
186      */
187     private final RegexValidator authorityValidator;
188 
189     /**
190      * If no schemes are provided, default to this set.
191      */
192     private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
193 
194     /**
195      * Singleton instance of this class with default schemes and options.
196      */
197     private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
198 
199     /**
200      * Returns the singleton instance of this class with default schemes and options.
201      * @return singleton instance with default schemes and options
202      */
203     public static UrlValidator getInstance() {
204         return DEFAULT_URL_VALIDATOR;
205     }
206 
207     /**
208      * Create a UrlValidator with default properties.
209      */
210     public UrlValidator() {
211         this(null);
212     }
213 
214     /**
215      * Behavior of validation is modified by passing in several strings options:
216      * @param schemes Pass in one or more url schemes to consider valid, passing in
217      *        a null will default to "http,https,ftp" being valid.
218      *        If a non-null schemes is specified then all valid schemes must
219      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
220      *        ignore the contents of schemes.
221      */
222     public UrlValidator(String[] schemes) {
223         this(schemes, 0L);
224     }
225 
226     /**
227      * Initialize a UrlValidator with the given validation options.
228      * @param options The options should be set using the public constants declared in
229      * this class.  To set multiple options you simply add them together.  For example,
230      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
231      */
232     public UrlValidator(long options) {
233         this(null, null, options);
234     }
235 
236     /**
237      * Behavior of validation is modified by passing in options:
238      * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
239      * @param options The options should be set using the public constants declared in
240      * this class.  To set multiple options you simply add them together.  For example,
241      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
242      */
243     public UrlValidator(String[] schemes, long options) {
244         this(schemes, null, options);
245     }
246 
247     /**
248      * Initialize a UrlValidator with the given validation options.
249      * @param authorityValidator Regular expression validator used to validate the authority part
250      * This allows the user to override the standard set of domains.
251      * @param options Validation options. Set using the public constants of this class.
252      * To set multiple options, simply add them together:
253      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
254      * enables both of those options.
255      */
256     public UrlValidator(RegexValidator authorityValidator, long options) {
257         this(null, authorityValidator, options);
258     }
259 
260     /**
261      * Customizable constructor. Validation behavior is modifed by passing in options.
262      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
263      * @param authorityValidator Regular expression validator used to validate the authority part
264      * @param options Validation options. Set using the public constants of this class.
265      * To set multiple options, simply add them together:
266      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
267      * enables both of those options.
268      */
269     public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
270         this.options = options;
271 
272         if (isOn(ALLOW_ALL_SCHEMES)) {
273             allowedSchemes = Collections.emptySet();
274         } else {
275             if (schemes == null) {
276                 schemes = DEFAULT_SCHEMES;
277             }
278             allowedSchemes = new HashSet<String>(schemes.length);
279             for(int i=0; i < schemes.length; i++) {
280                 allowedSchemes.add(schemes[i].toLowerCase(Locale.ENGLISH));
281             }
282         }
283 
284         this.authorityValidator = authorityValidator;
285     }
286 
287     /**
288      * <p>Checks if a field has a valid url address.</p>
289      *
290      * Note that the method calls #isValidAuthority()
291      * which checks that the domain is valid.
292      *
293      * @param value The value validation is being performed on.  A <code>null</code>
294      * value is considered invalid.
295      * @return true if the url is valid.
296      */
297     public boolean isValid(String value) {
298         if (value == null) {
299             return false;
300         }
301 
302         // Check the whole url address structure
303         Matcher urlMatcher = URL_PATTERN.matcher(value);
304         if (!urlMatcher.matches()) {
305             return false;
306         }
307 
308         String scheme = urlMatcher.group(PARSE_URL_SCHEME);
309         if (!isValidScheme(scheme)) {
310             return false;
311         }
312 
313         String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
314         if ("file".equals(scheme)) {// Special case - file: allows an empty authority
315             if (!"".equals(authority)) {
316                 if (authority.contains(":")) { // but cannot allow trailing :
317                     return false;
318                 }
319             }
320             // drop through to continue validation
321         } else { // not file:
322             // Validate the authority
323             if (!isValidAuthority(authority)) {
324                 return false;
325             }
326         }
327 
328         if (!isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
329             return false;
330         }
331 
332         if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
333             return false;
334         }
335 
336         if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
337             return false;
338         }
339 
340         return true;
341     }
342 
343     /**
344      * Validate scheme. If schemes[] was initialized to a non null,
345      * then only those schemes are allowed.
346      * Otherwise the default schemes are "http", "https", "ftp".
347      * Matching is case-blind.
348      * @param scheme The scheme to validate.  A <code>null</code> value is considered
349      * invalid.
350      * @return true if valid.
351      */
352     protected boolean isValidScheme(String scheme) {
353         if (scheme == null) {
354             return false;
355         }
356 
357         // TODO could be removed if external schemes were checked in the ctor before being stored
358         if (!SCHEME_PATTERN.matcher(scheme).matches()) {
359             return false;
360         }
361 
362         if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
363             return false;
364         }
365 
366         return true;
367     }
368 
369     /**
370      * Returns true if the authority is properly formatted.  An authority is the combination
371      * of hostname and port.  A <code>null</code> authority value is considered invalid.
372      * Note: this implementation validates the domain unless a RegexValidator was provided.
373      * If a RegexValidator was supplied and it matches, then the authority is regarded
374      * as valid with no further checks, otherwise the method checks against the
375      * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
376      * @param authority Authority value to validate, alllows IDN
377      * @return true if authority (hostname and port) is valid.
378      */
379     protected boolean isValidAuthority(String authority) {
380         if (authority == null) {
381             return false;
382         }
383 
384         // check manual authority validation if specified
385         if (authorityValidator != null && authorityValidator.isValid(authority)) {
386             return true;
387         }
388         // convert to ASCII if possible
389         final String authorityASCII = DomainValidator.unicodeToASCII(authority);
390 
391         Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
392         if (!authorityMatcher.matches()) {
393             return false;
394         }
395 
396         // We have to process IPV6 separately because that is parsed in a different group
397         String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
398         if (ipv6 != null) {
399             InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
400                 if (!inetAddressValidator.isValidInet6Address(ipv6)) {
401                     return false;
402                 }
403         } else {
404             String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
405             // check if authority is hostname or IP address:
406             // try a hostname first since that's much more likely
407             DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
408             if (!domainValidator.isValid(hostLocation)) {
409                 // try an IPv4 address
410                 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
411                 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
412                     // isn't IPv4, so the URL is invalid
413                     return false;
414                 }
415             }
416         }
417 
418         String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
419         if (extra != null && extra.trim().length() > 0){
420             return false;
421         }
422 
423         return true;
424     }
425 
426     /**
427      * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
428      * @param path Path value to validate.
429      * @return true if path is valid.
430      */
431     protected boolean isValidPath(String path) {
432         if (path == null) {
433             return false;
434         }
435 
436         if (!PATH_PATTERN.matcher(path).matches()) {
437             return false;
438         }
439 
440         try {
441             URI uri = new URI(null,null,path,null);
442             String norm = uri.normalize().getPath();
443             if (norm.startsWith("/../") // Trying to go via the parent dir 
444              || norm.equals("/..")) {   // Trying to go to the parent dir
445                 return false;
446             }
447         } catch (URISyntaxException e) {
448             return false;
449         }
450         
451         int slash2Count = countToken("//", path);
452         if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
453             return false;
454         }
455 
456         return true;
457     }
458 
459     /**
460      * Returns true if the query is null or it's a properly formatted query string.
461      * @param query Query value to validate.
462      * @return true if query is valid.
463      */
464     protected boolean isValidQuery(String query) {
465         if (query == null) {
466             return true;
467         }
468 
469         return QUERY_PATTERN.matcher(query).matches();
470     }
471 
472     /**
473      * Returns true if the given fragment is null or fragments are allowed.
474      * @param fragment Fragment value to validate.
475      * @return true if fragment is valid.
476      */
477     protected boolean isValidFragment(String fragment) {
478         if (fragment == null) {
479             return true;
480         }
481 
482         return isOff(NO_FRAGMENTS);
483     }
484 
485     /**
486      * Returns the number of times the token appears in the target.
487      * @param token Token value to be counted.
488      * @param target Target value to count tokens in.
489      * @return the number of tokens.
490      */
491     protected int countToken(String token, String target) {
492         int tokenIndex = 0;
493         int count = 0;
494         while (tokenIndex != -1) {
495             tokenIndex = target.indexOf(token, tokenIndex);
496             if (tokenIndex > -1) {
497                 tokenIndex++;
498                 count++;
499             }
500         }
501         return count;
502     }
503 
504     /**
505      * Tests whether the given flag is on.  If the flag is not a power of 2
506      * (ie. 3) this tests whether the combination of flags is on.
507      *
508      * @param flag Flag value to check.
509      *
510      * @return whether the specified flag value is on.
511      */
512     private boolean isOn(long flag) {
513         return (options & flag) > 0;
514     }
515 
516     /**
517      * Tests whether the given flag is off.  If the flag is not a power of 2
518      * (ie. 3) this tests whether the combination of flags is off.
519      *
520      * @param flag Flag value to check.
521      *
522      * @return whether the specified flag value is off.
523      */
524     private boolean isOff(long flag) {
525         return (options & flag) == 0;
526     }
527 
528     // Unit test access to pattern matcher
529     Matcher matchURL(String value) {
530         return URL_PATTERN.matcher(value);
531     }
532 }