UrlValidator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.validator;

  18. import java.io.Serializable;
  19. import java.util.Arrays;
  20. import java.util.HashSet;
  21. import java.util.Set;
  22. import java.util.regex.Matcher;
  23. import java.util.regex.Pattern;

  24. import org.apache.commons.validator.routines.InetAddressValidator;
  25. import org.apache.commons.validator.util.Flags;

  26. /**
  27.  * <p>Validates URLs.</p>
  28.  * Behaviour of validation is modified by passing in options:
  29.  * <ul>
  30.  * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
  31.  * component.</li>
  32.  * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed; if this option is
  33.  * included then fragments are flagged as illegal.</li>
  34.  * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
  35.  * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
  36.  * </ul>
  37.  *
  38.  * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
  39.  * https://javascript.internet.com. However, this validation now bears little resemblance
  40.  * to the PHP original.</p>
  41.  * <pre>
  42.  *   Example of usage:
  43.  *   Construct a UrlValidator with valid schemes of "http", and "https".
  44.  *
  45.  *    String[] schemes = {"http","https"};
  46.  *    UrlValidator urlValidator = new UrlValidator(schemes);
  47.  *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
  48.  *       System.out.println("URL is valid");
  49.  *    } else {
  50.  *       System.out.println("URL is invalid");
  51.  *    }
  52.  *
  53.  *    prints "URL is invalid"
  54.  *   If instead the default constructor is used.
  55.  *
  56.  *    UrlValidator urlValidator = new UrlValidator();
  57.  *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
  58.  *       System.out.println("URL is valid");
  59.  *    } else {
  60.  *       System.out.println("URL is invalid");
  61.  *    }
  62.  *
  63.  *   prints out "URL is valid"
  64.  *  </pre>
  65.  *
  66.  * @see
  67.  * <a href="http://www.ietf.org/rfc/rfc2396.txt">
  68.  *  Uniform Resource Identifiers (URI): Generic Syntax
  69.  * </a>
  70.  *
  71.  * @since 1.1
  72.  * @deprecated Use the new UrlValidator in the routines package. This class
  73.  * will be removed in a future release.
  74.  */
  75. @Deprecated
  76. public class UrlValidator implements Serializable {

  77.     private static final int TOP_LEVEL_MAX_LEN = 4;

  78.     private static final int TOP_LEVEL_MIN_LEN = 2;

  79.     private static final long serialVersionUID = 24137157400029593L;

  80.     /**
  81.      * Allows all validly formatted schemes to pass validation instead of
  82.      * supplying a set of valid schemes.
  83.      */
  84.     public static final int ALLOW_ALL_SCHEMES = 1 << 0;

  85.     /**
  86.      * Allow two slashes in the path component of the URL.
  87.      */
  88.     public static final int ALLOW_2_SLASHES = 1 << 1;

  89.     /**
  90.      * Enabling this options disallows any URL fragments.
  91.      */
  92.     public static final int NO_FRAGMENTS = 1 << 2;

  93.     private static final String ALPHA_CHARS = "a-zA-Z";

  94. // NOT USED   private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";

  95.     private static final String SPECIAL_CHARS = ";/@&=,.?:+$";

  96.     private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";

  97.     // Drop numeric, and  "+-." for now
  98.     private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";

  99.     private static final String ATOM = VALID_CHARS + '+';

  100.     /**
  101.      * This expression derived/taken from the BNF for URI (RFC2396).
  102.      */
  103.     private static final String URL_REGEX =
  104.             "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
  105.     //                                                                      12            3  4          5       6   7        8 9
  106.     private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);

  107.     /**
  108.      * Schema/Protocol (ie. http:, ftp:, file:, etc).
  109.      */
  110.     private static final int PARSE_URL_SCHEME = 2;

  111.     /**
  112.      * Includes hostname/ip and port number.
  113.      */
  114.     private static final int PARSE_URL_AUTHORITY = 4;

  115.     private static final int PARSE_URL_PATH = 5;

  116.     private static final int PARSE_URL_QUERY = 7;

  117.     private static final int PARSE_URL_FRAGMENT = 9;

  118.     /**
  119.      * Protocol (for example, http:, ftp:, https:).
  120.      */
  121.     private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*");

  122.     private static final String AUTHORITY_REGEX =
  123.        "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
  124.     //                                                                            1                          2  3       4
  125.     private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

  126.     private static final int PARSE_AUTHORITY_HOST_IP = 1;

  127.     private static final int PARSE_AUTHORITY_PORT = 2;

  128.     /**
  129.      * Should always be empty.
  130.      */
  131.     private static final int PARSE_AUTHORITY_EXTRA = 3;

  132.     private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$");

  133.     private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");

  134.     private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$");

  135.     private static final Pattern DOMAIN_PATTERN =
  136.             Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");

  137.     private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");

  138.     private static final Pattern ATOM_PATTERN = Pattern.compile("^(" + ATOM + ").*?$");

  139.     private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]");

  140.     /**
  141.      * Holds the set of current validation options.
  142.      */
  143.     private final Flags options;

  144.     /**
  145.      * The set of schemes that are allowed to be in a URL.
  146.      */
  147.     private final Set<String> allowedSchemes = new HashSet<>();

  148.     /**
  149.      * If no schemes are provided, default to this set.
  150.      */
  151.     protected String[] defaultSchemes = {"http", "https", "ftp"};

  152.     /**
  153.      * Create a UrlValidator with default properties.
  154.      */
  155.     public UrlValidator() {
  156.         this(null);
  157.     }

  158.     /**
  159.      * Initialize a UrlValidator with the given validation options.
  160.      * @param options The options should be set using the public constants declared in
  161.      * this class.  To set multiple options you simply add them together.  For example,
  162.      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
  163.      */
  164.     public UrlValidator(final int options) {
  165.         this(null, options);
  166.     }

  167.     /**
  168.      * Behavior of validation is modified by passing in several strings options:
  169.      * @param schemes Pass in one or more URL schemes to consider valid, passing in
  170.      *        a null will default to "http,https,ftp" being valid.
  171.      *        If a non-null schemes is specified then all valid schemes must
  172.      *        be specified. Setting the ALLOW_ALL_SCHEMES option will
  173.      *        ignore the contents of schemes.
  174.      */
  175.     public UrlValidator(final String[] schemes) {
  176.         this(schemes, 0);
  177.     }

  178.     /**
  179.      * Behaviour of validation is modified by passing in options:
  180.      * @param schemes The set of valid schemes.
  181.      * @param options The options should be set using the public constants declared in
  182.      * this class.  To set multiple options you simply add them together.  For example,
  183.      * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
  184.      */
  185.     public UrlValidator(String[] schemes, final int options) {
  186.         this.options = new Flags(options);

  187.         if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
  188.             return;
  189.         }

  190.         if (schemes == null) {
  191.             schemes = defaultSchemes;
  192.         }

  193.         allowedSchemes.addAll(Arrays.asList(schemes));
  194.     }

  195.     /**
  196.      * Returns the number of times the token appears in the target.
  197.      * @param token Token value to be counted.
  198.      * @param target Target value to count tokens in.
  199.      * @return the number of tokens.
  200.      */
  201.     protected int countToken(final String token, final String target) {
  202.         int tokenIndex = 0;
  203.         int count = 0;
  204.         while (tokenIndex != -1) {
  205.             tokenIndex = target.indexOf(token, tokenIndex);
  206.             if (tokenIndex > -1) {
  207.                 tokenIndex++;
  208.                 count++;
  209.             }
  210.         }
  211.         return count;
  212.     }

  213.     /**
  214.      * <p>Checks if a field has a valid URL address.</p>
  215.      *
  216.      * @param value The value validation is being performed on.  A {@code null}
  217.      * value is considered invalid.
  218.      * @return true if the URL is valid.
  219.      */
  220.     public boolean isValid(final String value) {
  221.         if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
  222.            return false;
  223.         }

  224.         // Check the whole url address structure
  225.         final Matcher urlMatcher = URL_PATTERN.matcher(value);
  226.         if (!urlMatcher.matches() || !isValidScheme(urlMatcher.group(PARSE_URL_SCHEME)) || !isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY)) || !isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
  227.             return false;
  228.         }

  229.         if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
  230.             return false;
  231.         }

  232.         if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
  233.             return false;
  234.         }

  235.         return true;
  236.     }

  237.     /**
  238.      * Returns true if the authority is properly formatted.  An authority is the combination
  239.      * of hostname and port.  A {@code null} authority value is considered invalid.
  240.      * @param authority Authority value to validate.
  241.      * @return true if authority (hostname and port) is valid.
  242.      */
  243.     protected boolean isValidAuthority(final String authority) {
  244.         if (authority == null) {
  245.             return false;
  246.         }

  247.         final InetAddressValidator inetAddressValidator =
  248.                 InetAddressValidator.getInstance();

  249.         final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
  250.         if (!authorityMatcher.matches()) {
  251.             return false;
  252.         }

  253.         boolean hostname = false;
  254.         // check if authority is IP address or hostname
  255.         String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
  256.         final boolean ipV4Address = inetAddressValidator.isValid(hostIP);

  257.         if (!ipV4Address) {
  258.             // Domain is hostname name
  259.             hostname = DOMAIN_PATTERN.matcher(hostIP).matches();
  260.         }

  261.         //rightmost hostname will never start with a digit.
  262.         if (hostname) {
  263.             // LOW-TECH FIX FOR VALIDATOR-202
  264.             // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
  265.             final char[] chars = hostIP.toCharArray();
  266.             int size = 1;
  267.             for (final char element : chars) {
  268.                 if (element == '.') {
  269.                     size++;
  270.                 }
  271.             }
  272.             final String[] domainSegment = new String[size];
  273.             boolean match = true;
  274.             int segmentCount = 0;
  275.             int segmentLength = 0;

  276.             while (match) {
  277.                 final Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
  278.                 match = atomMatcher.matches();
  279.                 if (match) {
  280.                     domainSegment[segmentCount] = atomMatcher.group(1);
  281.                     segmentLength = domainSegment[segmentCount].length() + 1;
  282.                     hostIP =
  283.                             segmentLength >= hostIP.length()
  284.                             ? ""
  285.                             : hostIP.substring(segmentLength);

  286.                     segmentCount++;
  287.                 }
  288.             }
  289.             final String topLevel = domainSegment[segmentCount - 1];


  290.             // First letter of top level must be an alpha
  291.             // Make sure there's a host name preceding the authority.
  292.             if (topLevel.length() < TOP_LEVEL_MIN_LEN || topLevel.length() > TOP_LEVEL_MAX_LEN || !ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()
  293.                     || segmentCount < 2) {
  294.                 return false;
  295.             }
  296.         }

  297.         if (!hostname && !ipV4Address) {
  298.             return false;
  299.         }

  300.         final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
  301.         if (port != null && !PORT_PATTERN.matcher(port).matches()) {
  302.             return false;
  303.         }

  304.         final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
  305.         if (!GenericValidator.isBlankOrNull(extra)) {
  306.             return false;
  307.         }

  308.         return true;
  309.     }

  310.     /**
  311.      * Returns true if the given fragment is null or fragments are allowed.
  312.      * @param fragment Fragment value to validate.
  313.      * @return true if fragment is valid.
  314.      */
  315.     protected boolean isValidFragment(final String fragment) {
  316.         if (fragment == null) {
  317.             return true;
  318.         }

  319.         return options.isOff(NO_FRAGMENTS);
  320.     }

  321.     /**
  322.      * Returns true if the path is valid.  A {@code null} value is considered invalid.
  323.      * @param path Path value to validate.
  324.      * @return true if path is valid.
  325.      */
  326.     protected boolean isValidPath(final String path) {
  327.         if (path == null || !PATH_PATTERN.matcher(path).matches()) {
  328.             return false;
  329.         }

  330.         final int slash2Count = countToken("//", path);
  331.         if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
  332.             return false;
  333.         }

  334.         final int slashCount = countToken("/", path);
  335.         final int dot2Count = countToken("..", path);
  336.         if (dot2Count > 0 && slashCount - slash2Count - 1 <= dot2Count) {
  337.             return false;
  338.         }

  339.         return true;
  340.     }

  341.     /**
  342.      * Returns true if the query is null, or it's a properly formatted query string.
  343.      * @param query Query value to validate.
  344.      * @return true if query is valid.
  345.      */
  346.     protected boolean isValidQuery(final String query) {
  347.         if (query == null) {
  348.             return true;
  349.         }

  350.         return QUERY_PATTERN.matcher(query).matches();
  351.     }

  352.     /**
  353.      * Validate scheme. If schemes[] was initialized to a non-null,
  354.      * then only those schemes are allowed.  Note this is slightly different
  355.      * than for the constructor.
  356.      * @param scheme The scheme to validate.  A {@code null} value is considered
  357.      * invalid.
  358.      * @return true if valid.
  359.      */
  360.     protected boolean isValidScheme(final String scheme) {
  361.         if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches() || options.isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme)) {
  362.             return false;
  363.         }

  364.         return true;
  365.     }
  366. }