1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.validator;
18
19 import java.io.Serializable;
20 import java.util.Arrays;
21 import java.util.HashSet;
22 import java.util.Set;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26 import org.apache.commons.validator.routines.InetAddressValidator;
27 import org.apache.commons.validator.util.Flags;
28
29 /**
30 * <p>Validates URLs.</p>
31 * Behaviour of validation is modified by passing in options:
32 * <ul>
33 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
34 * component.</li>
35 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
36 * included then fragments are flagged as illegal.</li>
37 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
38 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
39 * </ul>
40 *
41 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
42 * https://javascript.internet.com. However, this validation now bears little resemblance
43 * to the php original.</p>
44 * <pre>
45 * Example of usage:
46 * Construct a UrlValidator with valid schemes of "http", and "https".
47 *
48 * String[] schemes = {"http","https"}.
49 * UrlValidator urlValidator = new UrlValidator(schemes);
50 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
51 * System.out.println("URL is valid");
52 * } else {
53 * System.out.println("URL is invalid");
54 * }
55 *
56 * prints "URL is invalid"
57 * If instead the default constructor is used.
58 *
59 * UrlValidator urlValidator = new UrlValidator();
60 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
61 * System.out.println("URL is valid");
62 * } else {
63 * System.out.println("URL is invalid");
64 * }
65 *
66 * prints out "URL is valid"
67 * </pre>
68 *
69 * @see
70 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
71 * Uniform Resource Identifiers (URI): Generic Syntax
72 * </a>
73 *
74 * @since 1.1
75 * @deprecated Use the new UrlValidator in the routines package. This class
76 * will be removed in a future release.
77 */
78 @Deprecated
79 public class UrlValidator implements Serializable {
80
81 private static final int TOP_LEVEL_MAX_LEN = 4;
82
83 private static final int TOP_LEVEL_MIN_LEN = 2;
84
85 private static final long serialVersionUID = 24137157400029593L;
86
87 /**
88 * Allows all validly formatted schemes to pass validation instead of
89 * supplying a set of valid schemes.
90 */
91 public static final int ALLOW_ALL_SCHEMES = 1 << 0;
92
93 /**
94 * Allow two slashes in the path component of the URL.
95 */
96 public static final int ALLOW_2_SLASHES = 1 << 1;
97
98 /**
99 * Enabling this options disallows any URL fragments.
100 */
101 public static final int NO_FRAGMENTS = 1 << 2;
102
103 private static final String ALPHA_CHARS = "a-zA-Z";
104
105 // NOT USED private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
106
107 private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
108
109 private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
110
111 // Drop numeric, and "+-." for now
112 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";
113
114 private static final String ATOM = VALID_CHARS + '+';
115
116 /**
117 * This expression derived/taken from the BNF for URI (RFC2396).
118 */
119 private static final String URL_REGEX =
120 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
121 // 12 3 4 5 6 7 8 9
122 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
123
124 /**
125 * Schema/Protocol (ie. http:, ftp:, file:, etc).
126 */
127 private static final int PARSE_URL_SCHEME = 2;
128
129 /**
130 * Includes hostname/ip and port number.
131 */
132 private static final int PARSE_URL_AUTHORITY = 4;
133
134 private static final int PARSE_URL_PATH = 5;
135
136 private static final int PARSE_URL_QUERY = 7;
137
138 private static final int PARSE_URL_FRAGMENT = 9;
139
140 /**
141 * Protocol (for example, http:, ftp:, https:).
142 */
143 private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*");
144
145 private static final String AUTHORITY_REGEX =
146 "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
147 // 1 2 3 4
148 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
149
150 private static final int PARSE_AUTHORITY_HOST_IP = 1;
151
152 private static final int PARSE_AUTHORITY_PORT = 2;
153
154 /**
155 * Should always be empty.
156 */
157 private static final int PARSE_AUTHORITY_EXTRA = 3;
158
159 private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$");
160
161 private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
162
163 private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$");
164
165 private static final Pattern DOMAIN_PATTERN =
166 Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
167
168 private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
169
170 private static final Pattern ATOM_PATTERN = Pattern.compile("^(" + ATOM + ").*?$");
171
172 private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]");
173
174 /**
175 * Holds the set of current validation options.
176 */
177 private final Flags options;
178
179 /**
180 * The set of schemes that are allowed to be in a URL.
181 */
182 private final Set<String> allowedSchemes = new HashSet<>();
183
184 /**
185 * If no schemes are provided, default to this set.
186 */
187 protected String[] defaultSchemes = {"http", "https", "ftp"};
188
189 /**
190 * Create a UrlValidator with default properties.
191 */
192 public UrlValidator() {
193 this(null);
194 }
195
196 /**
197 * Initialize a UrlValidator with the given validation options.
198 * @param options The options should be set using the public constants declared in
199 * this class. To set multiple options you simply add them together. For example,
200 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
201 */
202 public UrlValidator(final int options) {
203 this(null, options);
204 }
205
206 /**
207 * Behavior of validation is modified by passing in several strings options:
208 * @param schemes Pass in one or more URL schemes to consider valid, passing in
209 * a null will default to "http,https,ftp" being valid.
210 * If a non-null schemes is specified then all valid schemes must
211 * be specified. Setting the ALLOW_ALL_SCHEMES option will
212 * ignore the contents of schemes.
213 */
214 public UrlValidator(final String[] schemes) {
215 this(schemes, 0);
216 }
217
218 /**
219 * Behaviour of validation is modified by passing in options:
220 * @param schemes The set of valid schemes.
221 * @param options The options should be set using the public constants declared in
222 * this class. To set multiple options you simply add them together. For example,
223 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
224 */
225 public UrlValidator(String[] schemes, final int options) {
226 this.options = new Flags(options);
227
228 if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
229 return;
230 }
231
232 if (schemes == null) {
233 schemes = defaultSchemes;
234 }
235
236 allowedSchemes.addAll(Arrays.asList(schemes));
237 }
238
239 /**
240 * Returns the number of times the token appears in the target.
241 * @param token Token value to be counted.
242 * @param target Target value to count tokens in.
243 * @return the number of tokens.
244 */
245 protected int countToken(final String token, final String target) {
246 int tokenIndex = 0;
247 int count = 0;
248 while (tokenIndex != -1) {
249 tokenIndex = target.indexOf(token, tokenIndex);
250 if (tokenIndex > -1) {
251 tokenIndex++;
252 count++;
253 }
254 }
255 return count;
256 }
257
258 /**
259 * <p>Checks if a field has a valid URL address.</p>
260 *
261 * @param value The value validation is being performed on. A {@code null}
262 * value is considered invalid.
263 * @return true if the URL is valid.
264 */
265 public boolean isValid(final String value) {
266 if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
267 return false;
268 }
269
270 // Check the whole url address structure
271 final Matcher urlMatcher = URL_PATTERN.matcher(value);
272 if (!urlMatcher.matches() || !isValidScheme(urlMatcher.group(PARSE_URL_SCHEME)) || !isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY)) || !isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
273 return false;
274 }
275
276 if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
277 return false;
278 }
279
280 if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
281 return false;
282 }
283
284 return true;
285 }
286
287 /**
288 * Returns true if the authority is properly formatted. An authority is the combination
289 * of hostname and port. A {@code null} authority value is considered invalid.
290 * @param authority Authority value to validate.
291 * @return true if authority (hostname and port) is valid.
292 */
293 protected boolean isValidAuthority(final String authority) {
294 if (authority == null) {
295 return false;
296 }
297
298 final InetAddressValidator inetAddressValidator =
299 InetAddressValidator.getInstance();
300
301 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
302 if (!authorityMatcher.matches()) {
303 return false;
304 }
305
306 boolean hostname = false;
307 // check if authority is IP address or hostname
308 String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
309 final boolean ipV4Address = inetAddressValidator.isValid(hostIP);
310
311 if (!ipV4Address) {
312 // Domain is hostname name
313 hostname = DOMAIN_PATTERN.matcher(hostIP).matches();
314 }
315
316 //rightmost hostname will never start with a digit.
317 if (hostname) {
318 // LOW-TECH FIX FOR VALIDATOR-202
319 // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
320 final char[] chars = hostIP.toCharArray();
321 int size = 1;
322 for (final char element : chars) {
323 if (element == '.') {
324 size++;
325 }
326 }
327 final String[] domainSegment = new String[size];
328 boolean match = true;
329 int segmentCount = 0;
330 int segmentLength = 0;
331
332 while (match) {
333 final Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
334 match = atomMatcher.matches();
335 if (match) {
336 domainSegment[segmentCount] = atomMatcher.group(1);
337 segmentLength = domainSegment[segmentCount].length() + 1;
338 hostIP =
339 segmentLength >= hostIP.length()
340 ? ""
341 : hostIP.substring(segmentLength);
342
343 segmentCount++;
344 }
345 }
346 final String topLevel = domainSegment[segmentCount - 1];
347
348
349 // First letter of top level must be an alpha
350 // Make sure there's a host name preceding the authority.
351 if (topLevel.length() < TOP_LEVEL_MIN_LEN || topLevel.length() > TOP_LEVEL_MAX_LEN || !ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()
352 || segmentCount < 2) {
353 return false;
354 }
355 }
356
357 if (!hostname && !ipV4Address) {
358 return false;
359 }
360
361 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
362 if (port != null && !PORT_PATTERN.matcher(port).matches()) {
363 return false;
364 }
365
366 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
367 if (!GenericValidator.isBlankOrNull(extra)) {
368 return false;
369 }
370
371 return true;
372 }
373
374 /**
375 * Returns true if the given fragment is null or fragments are allowed.
376 * @param fragment Fragment value to validate.
377 * @return true if fragment is valid.
378 */
379 protected boolean isValidFragment(final String fragment) {
380 if (fragment == null) {
381 return true;
382 }
383
384 return options.isOff(NO_FRAGMENTS);
385 }
386
387 /**
388 * Returns true if the path is valid. A {@code null} value is considered invalid.
389 * @param path Path value to validate.
390 * @return true if path is valid.
391 */
392 protected boolean isValidPath(final String path) {
393 if (path == null || !PATH_PATTERN.matcher(path).matches()) {
394 return false;
395 }
396
397 final int slash2Count = countToken("//", path);
398 if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
399 return false;
400 }
401
402 final int slashCount = countToken("/", path);
403 final int dot2Count = countToken("..", path);
404 if (dot2Count > 0 && slashCount - slash2Count - 1 <= dot2Count) {
405 return false;
406 }
407
408 return true;
409 }
410
411 /**
412 * Returns true if the query is null, or it's a properly formatted query string.
413 * @param query Query value to validate.
414 * @return true if query is valid.
415 */
416 protected boolean isValidQuery(final String query) {
417 if (query == null) {
418 return true;
419 }
420
421 return QUERY_PATTERN.matcher(query).matches();
422 }
423
424 /**
425 * Validate scheme. If schemes[] was initialized to a non-null,
426 * then only those schemes are allowed. Note this is slightly different
427 * than for the constructor.
428 * @param scheme The scheme to validate. A {@code null} value is considered
429 * invalid.
430 * @return true if valid.
431 */
432 protected boolean isValidScheme(final String scheme) {
433 if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches() || options.isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme)) {
434 return false;
435 }
436
437 return true;
438 }
439 }