View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.validator.routines;
18  
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
26  
27  /**
28   * <p><b>URL Validation</b> routines.</p>
29   * Behavior of validation is modified by passing in options:
30   * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
31   * component.</li>
32   * <li>NO_FRAGMENT- [FALSE]  By default fragments are allowed, if this option is
33   * included then fragments are flagged as illegal.</li>
34   * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
35   * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
36   *
37   * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
38   * http://javascript.internet.com. However, this validation now bears little resemblance
39   * to the php original.</p>
40   * <pre>
41   *   Example of usage:
42   *   Construct a UrlValidator with valid schemes of "http", and "https".
43   *
44   *    String[] schemes = {"http","https"}.
45   *    UrlValidator urlValidator = new UrlValidator(schemes);
46   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
47   *       System.out.println("url is valid");
48   *    } else {
49   *       System.out.println("url is invalid");
50   *    }
51   *
52   *    prints "url is invalid"
53   *   If instead the default constructor is used.
54   *
55   *    UrlValidator urlValidator = new UrlValidator();
56   *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
57   *       System.out.println("url is valid");
58   *    } else {
59   *       System.out.println("url is invalid");
60   *    }
61   *
62   *   prints out "url is valid"
63   *  </pre>
64   *
65   * @see
66   * <a href="http://www.ietf.org/rfc/rfc2396.txt">
67   *  Uniform Resource Identifiers (URI): Generic Syntax
68   * </a>
69   *
70   * @version $Revision: 1227719 $ $Date: 2012-01-05 18:45:51 +0100 (Do, 05 Jan 2012) $
71   * @since Validator 1.4
72   */
73  public class UrlValidator implements Serializable {
74  
75      private static final long serialVersionUID = 7557161713937335013L;
76  
77      /**
78       * Allows all validly formatted schemes to pass validation instead of
79       * supplying a set of valid schemes.
80       */
81      public static final long ALLOW_ALL_SCHEMES = 1 << 0;
82  
83      /**
84       * Allow two slashes in the path component of the URL.
85       */
86      public static final long ALLOW_2_SLASHES = 1 << 1;
87  
88      /**
89       * Enabling this options disallows any URL fragments.
90       */
91      public static final long NO_FRAGMENTS = 1 << 2;
92  
93      /**
94       * Allow local URLs, such as http://localhost/ or http://machine/ .
95       * This enables a broad-brush check, for complex local machine name
96       *  validation requirements you should create your validator with
97       *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
98       */
99      public static final long ALLOW_LOCAL_URLS = 1 << 3;
100 
101     // Drop numeric, and  "+-." for now
102     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";
103 
104     /**
105      * This expression derived/taken from the BNF for URI (RFC2396).
106      */
107     private static final String URL_REGEX =
108             "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
109     //                                                                      12            3  4          5       6   7        8 9
110     private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
111 
112     /**
113      * Schema/Protocol (ie. http:, ftp:, file:, etc).
114      */
115     private static final int PARSE_URL_SCHEME = 2;
116 
117     /**
118      * Includes hostname/ip and port number.
119      */
120     private static final int PARSE_URL_AUTHORITY = 4;
121 
122     private static final int PARSE_URL_PATH = 5;
123 
124     private static final int PARSE_URL_QUERY = 7;
125 
126     private static final int PARSE_URL_FRAGMENT = 9;
127 
128     /**
129      * Protocol (ie. http:, ftp:,https:).
130      */
131     private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
132     private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
133 
134     private static final String AUTHORITY_REGEX =
135             "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
136     //                                                                            1                          2  3       4
137     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
138 
139     private static final int PARSE_AUTHORITY_HOST_IP = 1;
140 
141     private static final int PARSE_AUTHORITY_PORT = 2;
142 
143     /**
144      * Should always be empty.
145      */
146     private static final int PARSE_AUTHORITY_EXTRA = 3;
147 
148     private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
149     private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
150 
151     private static final String QUERY_REGEX = "^(.*)$";
152     private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
153 
154     private static final String LEGAL_ASCII_REGEX = "^\\p{ASCII}+$";
155     private static final Pattern ASCII_PATTERN = Pattern.compile(LEGAL_ASCII_REGEX);
156 
157     private static final String PORT_REGEX = "^:(\\d{1,5})$";
158     private static final Pattern PORT_PATTERN = Pattern.compile(PORT_REGEX);
159 
160     /**
161      * Holds the set of current validation options.
162      */
163     private final long options;
164 
165     /**
166      * The set of schemes that are allowed to be in a URL.
167      */
168     private final Set allowedSchemes;
169 
170     /**
171      * Regular expressions used to manually validate authorities if IANA
172      * domain name validation isn't desired.
173      */
174     private final RegexValidator authorityValidator;
175 
176     /**
177      * If no schemes are provided, default to this set.
178      */
179     private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"};
180 
181     /**
182      * Singleton instance of this class with default schemes and options.
183      */
184     private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
185 
186     /**
187      * Returns the singleton instance of this class with default schemes and options.
188      * @return singleton instance with default schemes and options
189      */
190     public static UrlValidator getInstance() {
191         return DEFAULT_URL_VALIDATOR;
192     }
193 
194     /**
195      * Create a UrlValidator with default properties.
196      */
197     public UrlValidator() {
198         this(null);
199     }
200 
201     /**
202      * Behavior of validation is modified by passing in several strings options:
203      * @param schemes Pass in one or more url schemes to consider valid, passing in
204      *        a null will default to "http,https,ftp" being valid.
205      *        If a non-null schemes is specified then all valid schemes must
206      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
207      *        ignore the contents of schemes.
208      */
209     public UrlValidator(String[] schemes) {
210         this(schemes, 0L);
211     }
212 
213     /**
214      * Initialize a UrlValidator with the given validation options.
215      * @param options The options should be set using the public constants declared in
216      * this class.  To set multiple options you simply add them together.  For example,
217      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
218      */
219     public UrlValidator(long options) {
220         this(null, null, options);
221     }
222 
223     /**
224      * Behavior of validation is modified by passing in options:
225      * @param schemes The set of valid schemes.
226      * @param options The options should be set using the public constants declared in
227      * this class.  To set multiple options you simply add them together.  For example,
228      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
229      */
230     public UrlValidator(String[] schemes, long options) {
231         this(schemes, null, options);
232     }
233 
234     /**
235      * Initialize a UrlValidator with the given validation options.
236      * @param authorityValidator Regular expression validator used to validate the authority part
237      * @param options Validation options. Set using the public constants of this class.
238      * To set multiple options, simply add them together:
239      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
240      * enables both of those options.
241      */
242     public UrlValidator(RegexValidator authorityValidator, long options) {
243         this(null, authorityValidator, options);
244     }
245 
246     /**
247      * Customizable constructor. Validation behavior is modifed by passing in options.
248      * @param schemes the set of valid schemes
249      * @param authorityValidator Regular expression validator used to validate the authority part
250      * @param options Validation options. Set using the public constants of this class.
251      * To set multiple options, simply add them together:
252      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
253      * enables both of those options.
254      */
255     public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
256         this.options = options;
257 
258         if (isOn(ALLOW_ALL_SCHEMES)) {
259             this.allowedSchemes = Collections.EMPTY_SET;
260         } else {
261             if (schemes == null) {
262                 schemes = DEFAULT_SCHEMES;
263             }
264             this.allowedSchemes = new HashSet();
265             this.allowedSchemes.addAll(Arrays.asList(schemes));
266         }
267 
268         this.authorityValidator = authorityValidator;
269 
270     }
271 
272     /**
273      * <p>Checks if a field has a valid url address.</p>
274      *
275      * @param value The value validation is being performed on.  A <code>null</code>
276      * value is considered invalid.
277      * @return true if the url is valid.
278      */
279     public boolean isValid(String value) {
280         if (value == null) {
281             return false;
282         }
283 
284         if (!ASCII_PATTERN.matcher(value).matches()) {
285             return false;
286         }
287 
288         // Check the whole url address structure
289         Matcher urlMatcher = URL_PATTERN.matcher(value);
290         if (!urlMatcher.matches()) {
291             return false;
292         }
293 
294         String scheme = urlMatcher.group(PARSE_URL_SCHEME);
295         if (!isValidScheme(scheme)) {
296             return false;
297         }
298 
299         String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
300         if ("file".equals(scheme) && "".equals(authority)) {
301            // Special case - file: allows an empty authority
302         } else {
303            // Validate the authority
304            if (!isValidAuthority(authority)) {
305                return false;
306             }
307         }
308 
309         if (!isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
310             return false;
311         }
312 
313         if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
314             return false;
315         }
316 
317         if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
318             return false;
319         }
320 
321         return true;
322     }
323 
324     /**
325      * Validate scheme. If schemes[] was initialized to a non null,
326      * then only those scheme's are allowed.  Note this is slightly different
327      * than for the constructor.
328      * @param scheme The scheme to validate.  A <code>null</code> value is considered
329      * invalid.
330      * @return true if valid.
331      */
332     protected boolean isValidScheme(String scheme) {
333         if (scheme == null) {
334             return false;
335         }
336 
337         if (!SCHEME_PATTERN.matcher(scheme).matches()) {
338             return false;
339         }
340 
341         if (isOff(ALLOW_ALL_SCHEMES)) {
342 
343             if (!this.allowedSchemes.contains(scheme)) {
344                 return false;
345             }
346         }
347 
348         return true;
349     }
350 
351     /**
352      * Returns true if the authority is properly formatted.  An authority is the combination
353      * of hostname and port.  A <code>null</code> authority value is considered invalid.
354      * @param authority Authority value to validate.
355      * @return true if authority (hostname and port) is valid.
356      */
357     protected boolean isValidAuthority(String authority) {
358         if (authority == null) {
359             return false;
360         }
361 
362         // check manual authority validation if specified
363         if (authorityValidator != null) {
364             if (authorityValidator.isValid(authority)) {
365                 return true;
366             }
367         }
368 
369         Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
370         if (!authorityMatcher.matches()) {
371             return false;
372         }
373 
374         String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
375         // check if authority is hostname or IP address:
376         // try a hostname first since that's much more likely
377         DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
378         if (!domainValidator.isValid(hostLocation)) {
379             // try an IP address
380             InetAddressValidator inetAddressValidator =
381                 InetAddressValidator.getInstance();
382             if (!inetAddressValidator.isValid(hostLocation)) {
383                 // isn't either one, so the URL is invalid
384                 return false;
385             }
386         }
387 
388         String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
389         if (port != null) {
390             if (!PORT_PATTERN.matcher(port).matches()) {
391                 return false;
392             }
393         }
394 
395         String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
396         if (extra != null && extra.trim().length() > 0){
397             return false;
398         }
399 
400         return true;
401     }
402 
403     /**
404      * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
405      * @param path Path value to validate.
406      * @return true if path is valid.
407      */
408     protected boolean isValidPath(String path) {
409         if (path == null) {
410             return false;
411         }
412 
413         if (!PATH_PATTERN.matcher(path).matches()) {
414             return false;
415         }
416 
417         int slash2Count = countToken("//", path);
418         if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
419             return false;
420         }
421 
422         int slashCount = countToken("/", path);
423         int dot2Count = countToken("..", path);
424         if (dot2Count > 0) {
425             if ((slashCount - slash2Count - 1) <= dot2Count) {
426                 return false;
427             }
428         }
429 
430         return true;
431     }
432 
433     /**
434      * Returns true if the query is null or it's a properly formatted query string.
435      * @param query Query value to validate.
436      * @return true if query is valid.
437      */
438     protected boolean isValidQuery(String query) {
439         if (query == null) {
440             return true;
441         }
442 
443         return QUERY_PATTERN.matcher(query).matches();
444     }
445 
446     /**
447      * Returns true if the given fragment is null or fragments are allowed.
448      * @param fragment Fragment value to validate.
449      * @return true if fragment is valid.
450      */
451     protected boolean isValidFragment(String fragment) {
452         if (fragment == null) {
453             return true;
454         }
455 
456         return isOff(NO_FRAGMENTS);
457     }
458 
459     /**
460      * Returns the number of times the token appears in the target.
461      * @param token Token value to be counted.
462      * @param target Target value to count tokens in.
463      * @return the number of tokens.
464      */
465     protected int countToken(String token, String target) {
466         int tokenIndex = 0;
467         int count = 0;
468         while (tokenIndex != -1) {
469             tokenIndex = target.indexOf(token, tokenIndex);
470             if (tokenIndex > -1) {
471                 tokenIndex++;
472                 count++;
473             }
474         }
475         return count;
476     }
477 
478     /**
479      * Tests whether the given flag is on.  If the flag is not a power of 2
480      * (ie. 3) this tests whether the combination of flags is on.
481      *
482      * @param flag Flag value to check.
483      *
484      * @return whether the specified flag value is on.
485      */
486     private boolean isOn(long flag) {
487         return (this.options & flag) > 0;
488     }
489 
490     /**
491      * Tests whether the given flag is off.  If the flag is not a power of 2
492      * (ie. 3) this tests whether the combination of flags is off.
493      *
494      * @param flag Flag value to check.
495      *
496      * @return whether the specified flag value is off.
497      */
498     private boolean isOff(long flag) {
499         return (this.options & flag) == 0;
500     }
501 }