1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.validator;
18
19 import java.io.Serializable;
20 import java.util.Arrays;
21 import java.util.HashSet;
22 import java.util.Set;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26 import org.apache.commons.validator.routines.InetAddressValidator;
27 import org.apache.commons.validator.util.Flags;
28
29 /**
30 * <p>Validates URLs.</p>
31 * Behaviour of validation is modified by passing in options:
32 * <ul>
33 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
34 * component.</li>
35 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
36 * included then fragments are flagged as illegal.</li>
37 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
38 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
39 * </ul>
40 *
41 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
42 * https://javascript.internet.com. However, this validation now bears little resemblance
43 * to the php original.</p>
44 * <pre>
45 * Example of usage:
46 * Construct a UrlValidator with valid schemes of "http", and "https".
47 *
48 * String[] schemes = {"http","https"}.
49 * UrlValidator urlValidator = new UrlValidator(schemes);
50 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
51 * System.out.println("URL is valid");
52 * } else {
53 * System.out.println("URL is invalid");
54 * }
55 *
56 * prints "URL is invalid"
57 * If instead the default constructor is used.
58 *
59 * UrlValidator urlValidator = new UrlValidator();
60 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
61 * System.out.println("URL is valid");
62 * } else {
63 * System.out.println("URL is invalid");
64 * }
65 *
66 * prints out "URL is valid"
67 * </pre>
68 *
69 * @see
70 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
71 * Uniform Resource Identifiers (URI): Generic Syntax
72 * </a>
73 *
74 * @since 1.1
75 * @deprecated Use the new UrlValidator in the routines package. This class
76 * will be removed in a future release.
77 */
78 @Deprecated
79 public class UrlValidator implements Serializable {
80
81 private static final int TOP_LEVEL_MAX_LEN = 4;
82
83 private static final int TOP_LEVEL_MIN_LEN = 2;
84
85 private static final long serialVersionUID = 24137157400029593L;
86
87 /**
88 * Allows all validly formatted schemes to pass validation instead of
89 * supplying a set of valid schemes.
90 */
91 public static final int ALLOW_ALL_SCHEMES = 1 << 0;
92
93 /**
94 * Allow two slashes in the path component of the URL.
95 */
96 public static final int ALLOW_2_SLASHES = 1 << 1;
97
98 /**
99 * Enabling this options disallows any URL fragments.
100 */
101 public static final int NO_FRAGMENTS = 1 << 2;
102
103 private static final String ALPHA_CHARS = "a-zA-Z";
104
105 // NOT USED private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
106
107 private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
108
109 private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
110
111 // Drop numeric, and "+-." for now
112 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";
113
114 private static final String ATOM = VALID_CHARS + '+';
115
116 /**
117 * This expression derived/taken from the BNF for URI (RFC2396).
118 */
119 private static final String URL_REGEX =
120 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
121 // 12 3 4 5 6 7 8 9
122 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
123
124 /**
125 * Schema/Protocol (ie. http:, ftp:, file:, etc).
126 */
127 private static final int PARSE_URL_SCHEME = 2;
128
129 /**
130 * Includes hostname/ip and port number.
131 */
132 private static final int PARSE_URL_AUTHORITY = 4;
133
134 private static final int PARSE_URL_PATH = 5;
135
136 private static final int PARSE_URL_QUERY = 7;
137
138 private static final int PARSE_URL_FRAGMENT = 9;
139
140 /**
141 * Protocol (for example, http:, ftp:, https:).
142 */
143 private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*");
144
145 private static final String AUTHORITY_REGEX =
146 "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
147 // 1 2 3 4
148 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
149
150 private static final int PARSE_AUTHORITY_HOST_IP = 1;
151
152 private static final int PARSE_AUTHORITY_PORT = 2;
153
154 /**
155 * Should always be empty.
156 */
157 private static final int PARSE_AUTHORITY_EXTRA = 3;
158
159 private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$");
160
161 private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
162
163 private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$");
164
165 private static final Pattern DOMAIN_PATTERN =
166 Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
167
168 private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
169
170 private static final Pattern ATOM_PATTERN = Pattern.compile("^(" + ATOM + ").*?$");
171
172 private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]");
173
174 /**
175 * Holds the set of current validation options.
176 */
177 private final Flags options;
178
179 /**
180 * The set of schemes that are allowed to be in a URL.
181 */
182 private final Set<String> allowedSchemes = new HashSet<>();
183
184 /**
185 * If no schemes are provided, default to this set.
186 */
187 protected String[] defaultSchemes = {"http", "https", "ftp"};
188
189 /**
190 * Create a UrlValidator with default properties.
191 */
192 public UrlValidator() {
193 this(null);
194 }
195
196 /**
197 * Initialize a UrlValidator with the given validation options.
198 *
199 * @param options The options should be set using the public constants declared in
200 * this class. To set multiple options you simply add them together. For example,
201 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
202 */
203 public UrlValidator(final int options) {
204 this(null, options);
205 }
206
207 /**
208 * Behavior of validation is modified by passing in several strings options:
209 *
210 * @param schemes Pass in one or more URL schemes to consider valid, passing in
211 * a null will default to "http,https,ftp" being valid.
212 * If a non-null schemes is specified then all valid schemes must
213 * be specified. Setting the ALLOW_ALL_SCHEMES option will
214 * ignore the contents of schemes.
215 */
216 public UrlValidator(final String[] schemes) {
217 this(schemes, 0);
218 }
219
220 /**
221 * Behaviour of validation is modified by passing in options:
222 *
223 * @param schemes The set of valid schemes.
224 * @param options The options should be set using the public constants declared in
225 * this class. To set multiple options you simply add them together. For example,
226 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
227 */
228 public UrlValidator(String[] schemes, final int options) {
229 this.options = new Flags(options);
230
231 if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
232 return;
233 }
234
235 if (schemes == null) {
236 schemes = defaultSchemes;
237 }
238
239 allowedSchemes.addAll(Arrays.asList(schemes));
240 }
241
242 /**
243 * Returns the number of times the token appears in the target.
244 *
245 * @param token Token value to be counted.
246 * @param target Target value to count tokens in.
247 * @return the number of tokens.
248 */
249 protected int countToken(final String token, final String target) {
250 int tokenIndex = 0;
251 int count = 0;
252 while (tokenIndex != -1) {
253 tokenIndex = target.indexOf(token, tokenIndex);
254 if (tokenIndex > -1) {
255 tokenIndex++;
256 count++;
257 }
258 }
259 return count;
260 }
261
262 /**
263 * <p>Checks if a field has a valid URL address.</p>
264 *
265 * @param value The value validation is being performed on. A {@code null}
266 * value is considered invalid.
267 * @return true if the URL is valid.
268 */
269 public boolean isValid(final String value) {
270 if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
271 return false;
272 }
273
274 // Check the whole url address structure
275 final Matcher urlMatcher = URL_PATTERN.matcher(value);
276 if (!urlMatcher.matches() || !isValidScheme(urlMatcher.group(PARSE_URL_SCHEME)) || !isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY)) || !isValidPath(urlMatcher.group(PARSE_URL_PATH))) {
277 return false;
278 }
279
280 if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) {
281 return false;
282 }
283
284 if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) {
285 return false;
286 }
287
288 return true;
289 }
290
291 /**
292 * Returns true if the authority is properly formatted. An authority is the combination
293 * of hostname and port. A {@code null} authority value is considered invalid.
294 *
295 * @param authority Authority value to validate.
296 * @return true if authority (hostname and port) is valid.
297 */
298 protected boolean isValidAuthority(final String authority) {
299 if (authority == null) {
300 return false;
301 }
302
303 final InetAddressValidator inetAddressValidator =
304 InetAddressValidator.getInstance();
305
306 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
307 if (!authorityMatcher.matches()) {
308 return false;
309 }
310
311 boolean hostname = false;
312 // check if authority is IP address or hostname
313 String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
314 final boolean ipV4Address = inetAddressValidator.isValid(hostIP);
315
316 if (!ipV4Address) {
317 // Domain is hostname name
318 hostname = DOMAIN_PATTERN.matcher(hostIP).matches();
319 }
320
321 //rightmost hostname will never start with a digit.
322 if (hostname) {
323 // LOW-TECH FIX FOR VALIDATOR-202
324 // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
325 final char[] chars = hostIP.toCharArray();
326 int size = 1;
327 for (final char element : chars) {
328 if (element == '.') {
329 size++;
330 }
331 }
332 final String[] domainSegment = new String[size];
333 boolean match = true;
334 int segmentCount = 0;
335 int segmentLength = 0;
336
337 while (match) {
338 final Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
339 match = atomMatcher.matches();
340 if (match) {
341 domainSegment[segmentCount] = atomMatcher.group(1);
342 segmentLength = domainSegment[segmentCount].length() + 1;
343 hostIP =
344 segmentLength >= hostIP.length()
345 ? ""
346 : hostIP.substring(segmentLength);
347
348 segmentCount++;
349 }
350 }
351 final String topLevel = domainSegment[segmentCount - 1];
352
353
354 // First letter of top level must be an alpha
355 // Make sure there's a host name preceding the authority.
356 if (topLevel.length() < TOP_LEVEL_MIN_LEN || topLevel.length() > TOP_LEVEL_MAX_LEN || !ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()
357 || segmentCount < 2) {
358 return false;
359 }
360 }
361
362 if (!hostname && !ipV4Address) {
363 return false;
364 }
365
366 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
367 if (port != null && !PORT_PATTERN.matcher(port).matches()) {
368 return false;
369 }
370
371 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
372 if (!GenericValidator.isBlankOrNull(extra)) {
373 return false;
374 }
375
376 return true;
377 }
378
379 /**
380 * Returns true if the given fragment is null or fragments are allowed.
381 *
382 * @param fragment Fragment value to validate.
383 * @return true if fragment is valid.
384 */
385 protected boolean isValidFragment(final String fragment) {
386 if (fragment == null) {
387 return true;
388 }
389
390 return options.isOff(NO_FRAGMENTS);
391 }
392
393 /**
394 * Returns true if the path is valid. A {@code null} value is considered invalid.
395 *
396 * @param path Path value to validate.
397 * @return true if path is valid.
398 */
399 protected boolean isValidPath(final String path) {
400 if (path == null || !PATH_PATTERN.matcher(path).matches()) {
401 return false;
402 }
403
404 final int slash2Count = countToken("//", path);
405 if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
406 return false;
407 }
408
409 final int slashCount = countToken("/", path);
410 final int dot2Count = countToken("..", path);
411 if (dot2Count > 0 && slashCount - slash2Count - 1 <= dot2Count) {
412 return false;
413 }
414
415 return true;
416 }
417
418 /**
419 * Returns true if the query is null, or it's a properly formatted query string.
420 *
421 * @param query Query value to validate.
422 * @return true if query is valid.
423 */
424 protected boolean isValidQuery(final String query) {
425 if (query == null) {
426 return true;
427 }
428
429 return QUERY_PATTERN.matcher(query).matches();
430 }
431
432 /**
433 * Validate scheme. If schemes[] was initialized to a non-null,
434 * then only those schemes are allowed. Note this is slightly different
435 * than for the constructor.
436 *
437 * @param scheme The scheme to validate. A {@code null} value is considered
438 * invalid.
439 * @return true if valid.
440 */
441 protected boolean isValidScheme(final String scheme) {
442 if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches() || options.isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme)) {
443 return false;
444 }
445
446 return true;
447 }
448 }