UrlValidator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.validator.routines;

  18. import java.io.Serializable;
  19. import java.net.URI;
  20. import java.net.URISyntaxException;
  21. import java.util.Collections;
  22. import java.util.HashSet;
  23. import java.util.Locale;
  24. import java.util.Set;
  25. import java.util.regex.Matcher;
  26. import java.util.regex.Pattern;

  27. import org.apache.commons.validator.GenericValidator;

  28. /**
  29.  * <p><b>URL Validation</b> routines.</p>
  30.  * Behavior of validation is modified by passing in options:
  31.  * <ul>
  32.  * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
  33.  * component.</li>
  34.  * <li>NO_FRAGMENT- [FALSE]  By default fragments are allowed, if this option is
  35.  * included then fragments are flagged as illegal.</li>
  36.  * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
  37.  * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
  38.  * </ul>
  39.  *
  40.  * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
  41.  * https://javascript.internet.com. However, this validation now bears little resemblance
  42.  * to the php original.</p>
  43.  * <pre>
  44.  *   Example of usage:
  45.  *   Construct a UrlValidator with valid schemes of "http", and "https".
  46.  *
  47.  *    String[] schemes = {"http","https"}.
  48.  *    UrlValidator urlValidator = new UrlValidator(schemes);
  49.  *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
  50.  *       System.out.println("URL is valid");
  51.  *    } else {
  52.  *       System.out.println("URL is invalid");
  53.  *    }
  54.  *
  55.  *    prints "URL is invalid"
  56.  *   If instead the default constructor is used.
  57.  *
  58.  *    UrlValidator urlValidator = new UrlValidator();
  59.  *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
  60.  *       System.out.println("URL is valid");
  61.  *    } else {
  62.  *       System.out.println("URL is invalid");
  63.  *    }
  64.  *
  65.  *   prints out "URL is valid"
  66.  *  </pre>
  67.  *
  68.  * @see
  69.  * <a href="http://www.ietf.org/rfc/rfc2396.txt">
  70.  *  Uniform Resource Identifiers (URI): Generic Syntax
  71.  * </a>
  72.  *
  73.  * @since 1.4
  74.  */
  75. public class UrlValidator implements Serializable {

  76.     private static final long serialVersionUID = 7557161713937335013L;

  77.     private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max

  78.     /**
  79.      * Allows all validly formatted schemes to pass validation instead of
  80.      * supplying a set of valid schemes.
  81.      */
  82.     public static final long ALLOW_ALL_SCHEMES = 1 << 0;

  83.     /**
  84.      * Allow two slashes in the path component of the URL.
  85.      */
  86.     public static final long ALLOW_2_SLASHES = 1 << 1;

  87.     /**
  88.      * Enabling this options disallows any URL fragments.
  89.      */
  90.     public static final long NO_FRAGMENTS = 1 << 2;

  91.     /**
  92.      * Allow local URLs, such as https://localhost/ or https://machine/ .
  93.      * This enables a broad-brush check, for complex local machine name
  94.      *  validation requirements you should create your validator with
  95.      *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
  96.      */
  97.     public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber

  98.     /**
  99.      * Protocol scheme (e.g. http, ftp, https).
  100.      */
  101.     private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
  102.     private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);

  103.     // Drop numeric, and  "+-." for now
  104.     // TODO does not allow for optional userinfo.
  105.     // Validation of character set is done by isValidAuthority
  106.     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
  107.     // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
  108.     private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix

  109.     // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
  110.     // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
  111.     // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
  112.     // We assume that password has the same valid chars as user info
  113.     private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";

  114.     // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
  115.     private static final String USERINFO_FIELD_REGEX =
  116.             USERINFO_CHARS_REGEX + "+" + // At least one character for the name
  117.             "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent

  118.     private static final String AUTHORITY_REGEX =
  119.             "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
  120.     //             1                                 e.g. user:pass@           2                                       3       4
  121.     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

  122.     private static final int PARSE_AUTHORITY_IPV6 = 1;

  123.     private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present

  124.     private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon

  125.     /**
  126.      * Should always be empty. The code currently allows spaces.
  127.      */
  128.     private static final int PARSE_AUTHORITY_EXTRA = 4;

  129.     private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
  130.     private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);

  131.     private static final String QUERY_REGEX = "^(\\S*)$";
  132.     private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);

  133.     /**
  134.      * If no schemes are provided, default to this set.
  135.      */
  136.     private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case

  137.     /**
  138.      * Singleton instance of this class with default schemes and options.
  139.      */
  140.     private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();

  141.     /**
  142.      * Returns the singleton instance of this class with default schemes and options.
  143.      * @return singleton instance with default schemes and options
  144.      */
  145.     public static UrlValidator getInstance() {
  146.         return DEFAULT_URL_VALIDATOR;
  147.     }

  148.     /**
  149.      * Tests whether the given flag is on.  If the flag is not a power of 2
  150.      * (e.g. 3) this tests whether the combination of flags is on.
  151.      *
  152.      * @param flag Flag value to check.
  153.      * @param options what to check
  154.      *
  155.      * @return whether the specified flag value is on.
  156.      */
  157.     private static boolean isOn(final long flag, final long options) {
  158.         return (options & flag) > 0;
  159.     }

  160.     /**
  161.      * Holds the set of current validation options.
  162.      */
  163.     private final long options;

  164.     /**
  165.      * The set of schemes that are allowed to be in a URL.
  166.      */
  167.     private final Set<String> allowedSchemes; // Must be lower-case

  168.     /**
  169.      * Regular expressions used to manually validate authorities if IANA
  170.      * domain name validation isn't desired.
  171.      */
  172.     private final RegexValidator authorityValidator;

  173.     private final DomainValidator domainValidator;

  174.     /**
  175.      * Create a UrlValidator with default properties.
  176.      */
  177.     public UrlValidator() {
  178.         this(null);
  179.     }

  180.     /**
  181.      * Initialize a UrlValidator with the given validation options.
  182.      * @param options The options should be set using the public constants declared in
  183.      * this class.  To set multiple options you simply add them together.  For example,
  184.      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
  185.      */
  186.     public UrlValidator(final long options) {
  187.         this(null, null, options);
  188.     }

  189.     /**
  190.      * Initialize a UrlValidator with the given validation options.
  191.      * @param authorityValidator Regular expression validator used to validate the authority part
  192.      * This allows the user to override the standard set of domains.
  193.      * @param options Validation options. Set using the public constants of this class.
  194.      * To set multiple options, simply add them together:
  195.      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
  196.      * enables both of those options.
  197.      */
  198.     public UrlValidator(final RegexValidator authorityValidator, final long options) {
  199.         this(null, authorityValidator, options);
  200.     }

  201.     /**
  202.      * Behavior of validation is modified by passing in several strings options:
  203.      * @param schemes Pass in one or more URL schemes to consider valid, passing in
  204.      *        a null will default to "http,https,ftp" being valid.
  205.      *        If a non-null schemes is specified then all valid schemes must
  206.      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
  207.      *        ignore the contents of schemes.
  208.      */
  209.     public UrlValidator(final String[] schemes) {
  210.         this(schemes, 0L);
  211.     }

  212.     /**
  213.      * Behavior of validation is modified by passing in options:
  214.      * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
  215.      * @param options The options should be set using the public constants declared in
  216.      * this class.  To set multiple options you simply add them together.  For example,
  217.      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
  218.      */
  219.     public UrlValidator(final String[] schemes, final long options) {
  220.         this(schemes, null, options);
  221.     }

  222.     /**
  223.      * Customizable constructor. Validation behavior is modified by passing in options.
  224.      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
  225.      * @param authorityValidator Regular expression validator used to validate the authority part
  226.      * @param options Validation options. Set using the public constants of this class.
  227.      * To set multiple options, simply add them together:
  228.      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
  229.      * enables both of those options.
  230.      */
  231.     public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
  232.         this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
  233.     }

  234.     /**
  235.      * Customizable constructor. Validation behavior is modified by passing in options.
  236.      * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
  237.      * @param authorityValidator Regular expression validator used to validate the authority part
  238.      * @param options Validation options. Set using the public constants of this class.
  239.      * To set multiple options, simply add them together:
  240.      * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
  241.      * enables both of those options.
  242.      * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
  243.      * @since 1.7
  244.      */
  245.     public UrlValidator(String[] schemes, final RegexValidator authorityValidator, final long options, final DomainValidator domainValidator) {
  246.         this.options = options;
  247.         if (domainValidator == null) {
  248.             throw new IllegalArgumentException("DomainValidator must not be null");
  249.         }
  250.         if (domainValidator.isAllowLocal() != (options & ALLOW_LOCAL_URLS) > 0) {
  251.             throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
  252.         }
  253.         this.domainValidator = domainValidator;

  254.         if (isOn(ALLOW_ALL_SCHEMES)) {
  255.             allowedSchemes = Collections.emptySet();
  256.         } else {
  257.             if (schemes == null) {
  258.                 schemes = DEFAULT_SCHEMES;
  259.             }
  260.             allowedSchemes = new HashSet<>(schemes.length);
  261.             for (final String scheme : schemes) {
  262.                 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
  263.             }
  264.         }

  265.         this.authorityValidator = authorityValidator;
  266.     }

  267.     /**
  268.      * Returns the number of times the token appears in the target.
  269.      * @param token Token value to be counted.
  270.      * @param target Target value to count tokens in.
  271.      * @return the number of tokens.
  272.      */
  273.     protected int countToken(final String token, final String target) {
  274.         int tokenIndex = 0;
  275.         int count = 0;
  276.         while (tokenIndex != -1) {
  277.             tokenIndex = target.indexOf(token, tokenIndex);
  278.             if (tokenIndex > -1) {
  279.                 tokenIndex++;
  280.                 count++;
  281.             }
  282.         }
  283.         return count;
  284.     }

  285.     /**
  286.      * Tests whether the given flag is off.  If the flag is not a power of 2
  287.      * (ie. 3) this tests whether the combination of flags is off.
  288.      *
  289.      * @param flag Flag value to check.
  290.      *
  291.      * @return whether the specified flag value is off.
  292.      */
  293.     private boolean isOff(final long flag) {
  294.         return (options & flag) == 0;
  295.     }

  296.     /**
  297.      * Tests whether the given flag is on.  If the flag is not a power of 2
  298.      * (ie. 3) this tests whether the combination of flags is on.
  299.      *
  300.      * @param flag Flag value to check.
  301.      *
  302.      * @return whether the specified flag value is on.
  303.      */
  304.     private boolean isOn(final long flag) {
  305.         return (options & flag) > 0;
  306.     }

  307.     /**
  308.      * <p>Checks if a field has a valid URL address.</p>
  309.      *
  310.      * Note that the method calls #isValidAuthority()
  311.      * which checks that the domain is valid.
  312.      *
  313.      * @param value The value validation is being performed on.  A {@code null}
  314.      * value is considered invalid.
  315.      * @return true if the URL is valid.
  316.      */
  317.     public boolean isValid(final String value) {
  318.         if (value == null) {
  319.             return false;
  320.         }

  321.         URI uri; // ensure value is a valid URI
  322.         try {
  323.             uri = new URI(value);
  324.         } catch (final URISyntaxException e) {
  325.             return false;
  326.         }
  327.         // OK, perform additional validation

  328.         final String scheme = uri.getScheme();
  329.         if (!isValidScheme(scheme)) {
  330.             return false;
  331.         }

  332.         final String authority = uri.getRawAuthority();
  333.         if ("file".equals(scheme) && GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority
  334.             return true; // this is a local file - nothing more to do here
  335.         }
  336.         if ("file".equals(scheme) && authority != null && authority.contains(":")) {
  337.             return false;
  338.         }
  339.         // Validate the authority
  340.         if (!isValidAuthority(authority)) {
  341.             return false;
  342.         }

  343.         if (!isValidPath(uri.getRawPath())) {
  344.             return false;
  345.         }

  346.         if (!isValidQuery(uri.getRawQuery())) {
  347.             return false;
  348.         }

  349.         if (!isValidFragment(uri.getRawFragment())) {
  350.             return false;
  351.         }

  352.         return true;
  353.     }

  354.     /**
  355.      * Returns true if the authority is properly formatted.  An authority is the combination
  356.      * of hostname and port.  A {@code null} authority value is considered invalid.
  357.      * Note: this implementation validates the domain unless a RegexValidator was provided.
  358.      * If a RegexValidator was supplied and it matches, then the authority is regarded
  359.      * as valid with no further checks, otherwise the method checks against the
  360.      * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
  361.      * @param authority Authority value to validate, alllows IDN
  362.      * @return true if authority (hostname and port) is valid.
  363.      */
  364.     protected boolean isValidAuthority(final String authority) {
  365.         if (authority == null) {
  366.             return false;
  367.         }

  368.         // check manual authority validation if specified
  369.         if (authorityValidator != null && authorityValidator.isValid(authority)) {
  370.             return true;
  371.         }
  372.         // convert to ASCII if possible
  373.         final String authorityASCII = DomainValidator.unicodeToASCII(authority);

  374.         final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
  375.         if (!authorityMatcher.matches()) {
  376.             return false;
  377.         }

  378.         // We have to process IPV6 separately because that is parsed in a different group
  379.         final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
  380.         if (ipv6 != null) {
  381.             final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
  382.             if (!inetAddressValidator.isValidInet6Address(ipv6)) {
  383.                 return false;
  384.             }
  385.         } else {
  386.             final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
  387.             // check if authority is hostname or IP address:
  388.             // try a hostname first since that's much more likely
  389.             if (!this.domainValidator.isValid(hostLocation)) {
  390.                 // try an IPv4 address
  391.                 final InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
  392.                 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
  393.                     // isn't IPv4, so the URL is invalid
  394.                     return false;
  395.                 }
  396.             }
  397.             final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
  398.             if (!GenericValidator.isBlankOrNull(port)) {
  399.                 try {
  400.                     final int iPort = Integer.parseInt(port);
  401.                     if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) {
  402.                         return false;
  403.                     }
  404.                 } catch (final NumberFormatException nfe) {
  405.                     return false; // this can happen for big numbers
  406.                 }
  407.             }
  408.         }

  409.         final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
  410.         if (extra != null && !extra.trim().isEmpty()) {
  411.             return false;
  412.         }

  413.         return true;
  414.     }

  415.     /**
  416.      * Returns true if the given fragment is null or fragments are allowed.
  417.      * @param fragment Fragment value to validate.
  418.      * @return true if fragment is valid.
  419.      */
  420.     protected boolean isValidFragment(final String fragment) {
  421.         if (fragment == null) {
  422.             return true;
  423.         }

  424.         return isOff(NO_FRAGMENTS);
  425.     }

  426.     /**
  427.      * Returns true if the path is valid.  A {@code null} value is considered invalid.
  428.      * @param path Path value to validate.
  429.      * @return true if path is valid.
  430.      */
  431.     protected boolean isValidPath(final String path) {
  432.         if (path == null) {
  433.             return false;
  434.         }

  435.         if (!PATH_PATTERN.matcher(path).matches()) {
  436.             return false;
  437.         }

  438.         try {
  439.             // Don't omit host otherwise leading path may be taken as host if it starts with //
  440.             final URI uri = new URI(null, "localhost", path, null);
  441.             final String norm = uri.normalize().getPath();
  442.             if (norm.startsWith("/../") // Trying to go via the parent dir
  443.                     || norm.equals("/..")) { // Trying to go to the parent dir
  444.                 return false;
  445.             }
  446.         } catch (final URISyntaxException e) {
  447.             return false;
  448.         }

  449.         final int slash2Count = countToken("//", path);
  450.         if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
  451.             return false;
  452.         }

  453.         return true;
  454.     }

  455.     /**
  456.      * Returns true if the query is null or it's a properly formatted query string.
  457.      * @param query Query value to validate.
  458.      * @return true if query is valid.
  459.      */
  460.     protected boolean isValidQuery(final String query) {
  461.         if (query == null) {
  462.             return true;
  463.         }
  464.         return QUERY_PATTERN.matcher(query).matches();
  465.     }

  466.     /**
  467.      * Validate scheme. If schemes[] was initialized to a non null,
  468.      * then only those schemes are allowed.
  469.      * Otherwise the default schemes are "http", "https", "ftp".
  470.      * Matching is case-blind.
  471.      * @param scheme The scheme to validate.  A {@code null} value is considered
  472.      * invalid.
  473.      * @return true if valid.
  474.      */
  475.     protected boolean isValidScheme(final String scheme) {
  476.         if (scheme == null) {
  477.             return false;
  478.         }

  479.         if (!SCHEME_PATTERN.matcher(scheme).matches()) {
  480.             return false;
  481.         }

  482.         if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
  483.             return false;
  484.         }

  485.         return true;
  486.     }

  487. }