/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator.routines;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

23  /**
24   * <p><b>Domain name</b> validation routines.</p>
25   *
26   * <p>
27   * This validator provides methods for validating Internet domain names
28   * and top-level domains.
29   * </p>
30   *
31   * <p>Domain names are evaluated according
32   * to the standards <a href="http://www.ietf.org/rfc/rfc1034.txt">RFC1034</a>,
33   * section 3, and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC1123</a>,
34   * section 2.1. No accomodation is provided for the specialized needs of
35   * other applications; if the domain name has been URL-encoded, for example,
36   * validation will fail even though the equivalent plaintext version of the
37   * same name would have passed.
38   * </p>
39   *
40   * <p>
41   * Validation is also provided for top-level domains (TLDs) as defined and
42   * maintained by the Internet Assigned Numbers Authority (IANA):
43   * </p>
44   *
45   *   <ul>
46   *     <li>{@link #isValidInfrastructureTld} - validates infrastructure TLDs
47   *         (<code>.arpa</code>, etc.)</li>
48   *     <li>{@link #isValidGenericTld} - validates generic TLDs
49   *         (<code>.com, .org</code>, etc.)</li>
50   *     <li>{@link #isValidCountryCodeTld} - validates country code TLDs
51   *         (<code>.us, .uk, .cn</code>, etc.)</li>
52   *   </ul>
53   *
54   * <p>
55   * (<b>NOTE</b>: This class does not provide IP address lookup for domain names or
56   * methods to ensure that a given domain name matches a specific IP; see
57   * {@link java.net.InetAddress} for that functionality.)
58   * </p>
59   *
60   * @version $Revision: 1227719 $ $Date: 2012-01-05 18:45:51 +0100 (Do, 05 Jan 2012) $
61   * @since Validator 1.4
62   */
63  public class DomainValidator implements Serializable {
64  
65      private static final long serialVersionUID = -4407125112880174009L;
66  
67      // Regular expression strings for hostnames (derived from RFC2396 and RFC 1123)
68      private static final String DOMAIN_LABEL_REGEX = "\\p{Alnum}(?>[\\p{Alnum}-]*\\p{Alnum})*";
69      private static final String TOP_LABEL_REGEX = "\\p{Alpha}{2,}";
70      private static final String DOMAIN_NAME_REGEX =
71              "^(?:" + DOMAIN_LABEL_REGEX + "\\.)+" + "(" + TOP_LABEL_REGEX + ")$";
72  
73      private final boolean allowLocal;
74  
75      /**
76       * Singleton instance of this validator, which
77       *  doesn't consider local addresses as valid.
78       */
79      private static final DomainValidator DOMAIN_VALIDATOR = new DomainValidator(false);
80  
81      /**
82       * Singleton instance of this validator, which does
83       *  consider local addresses valid.
84       */
85      private static final DomainValidator DOMAIN_VALIDATOR_WITH_LOCAL = new DomainValidator(true);
86  
87      /**
88       * RegexValidator for matching domains.
89       */
90      private final RegexValidator domainRegex =
91              new RegexValidator(DOMAIN_NAME_REGEX);
92      /**
93       * RegexValidator for matching the a local hostname
94       */
95      private final RegexValidator hostnameRegex =
96              new RegexValidator(DOMAIN_LABEL_REGEX);
97  
98      /**
99       * Returns the singleton instance of this validator. It
100      *  will not consider local addresses as valid.
101      * @return the singleton instance of this validator
102      */
103     public static DomainValidator getInstance() {
104         return DOMAIN_VALIDATOR;
105     }
106 
107     /**
108      * Returns the singleton instance of this validator,
109      *  with local validation as required.
110      * @param allowLocal Should local addresses be considered valid?
111      * @return the singleton instance of this validator
112      */
113     public static DomainValidator getInstance(boolean allowLocal) {
114        if(allowLocal) {
115           return DOMAIN_VALIDATOR_WITH_LOCAL;
116        }
117        return DOMAIN_VALIDATOR;
118     }
119 
120     /** Private constructor. */
121     private DomainValidator(boolean allowLocal) {
122        this.allowLocal = allowLocal;
123     }
124 
125     /**
126      * Returns true if the specified <code>String</code> parses
127      * as a valid domain name with a recognized top-level domain.
128      * The parsing is case-sensitive.
129      * @param domain the parameter to check for domain name syntax
130      * @return true if the parameter is a valid domain name
131      */
132     public boolean isValid(String domain) {
133         String[] groups = domainRegex.match(domain);
134         if (groups != null && groups.length > 0) {
135             return isValidTld(groups[0]);
136         } else if(allowLocal) {
137             if (hostnameRegex.isValid(domain)) {
138                return true;
139             }
140         }
141         return false;
142     }
143 
144     /**
145      * Returns true if the specified <code>String</code> matches any
146      * IANA-defined top-level domain. Leading dots are ignored if present.
147      * The search is case-sensitive.
148      * @param tld the parameter to check for TLD status
149      * @return true if the parameter is a TLD
150      */
151     public boolean isValidTld(String tld) {
152         if(allowLocal && isValidLocalTld(tld)) {
153            return true;
154         }
155         return isValidInfrastructureTld(tld)
156                 || isValidGenericTld(tld)
157                 || isValidCountryCodeTld(tld);
158     }
159 
160     /**
161      * Returns true if the specified <code>String</code> matches any
162      * IANA-defined infrastructure top-level domain. Leading dots are
163      * ignored if present. The search is case-sensitive.
164      * @param iTld the parameter to check for infrastructure TLD status
165      * @return true if the parameter is an infrastructure TLD
166      */
167     public boolean isValidInfrastructureTld(String iTld) {
168         return INFRASTRUCTURE_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
169     }
170 
171     /**
172      * Returns true if the specified <code>String</code> matches any
173      * IANA-defined generic top-level domain. Leading dots are ignored
174      * if present. The search is case-sensitive.
175      * @param gTld the parameter to check for generic TLD status
176      * @return true if the parameter is a generic TLD
177      */
178     public boolean isValidGenericTld(String gTld) {
179         return GENERIC_TLD_LIST.contains(chompLeadingDot(gTld.toLowerCase()));
180     }
181 
182     /**
183      * Returns true if the specified <code>String</code> matches any
184      * IANA-defined country code top-level domain. Leading dots are
185      * ignored if present. The search is case-sensitive.
186      * @param ccTld the parameter to check for country code TLD status
187      * @return true if the parameter is a country code TLD
188      */
189     public boolean isValidCountryCodeTld(String ccTld) {
190         return COUNTRY_CODE_TLD_LIST.contains(chompLeadingDot(ccTld.toLowerCase()));
191     }
192 
193     /**
194      * Returns true if the specified <code>String</code> matches any
195      * widely used "local" domains (localhost or localdomain). Leading dots are
196      *  ignored if present. The search is case-sensitive.
197      * @param iTld the parameter to check for local TLD status
198      * @return true if the parameter is an local TLD
199      */
200     public boolean isValidLocalTld(String iTld) {
201         return LOCAL_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
202     }
203 
204     private String chompLeadingDot(String str) {
205         if (str.startsWith(".")) {
206             return str.substring(1);
207         } else {
208             return str;
209         }
210     }
211 
212     // ---------------------------------------------
213     // ----- TLDs defined by IANA
214     // ----- Authoritative and comprehensive list at:
215     // ----- http://data.iana.org/TLD/tlds-alpha-by-domain.txt
216 
217     private static final String[] INFRASTRUCTURE_TLDS = new String[] {
218         "arpa",               // internet infrastructure
219         "root"                // diagnostic marker for non-truncated root zone
220     };
221 
222     private static final String[] GENERIC_TLDS = new String[] {
223         "aero",               // air transport industry
224         "asia",               // Pan-Asia/Asia Pacific
225         "biz",                // businesses
226         "cat",                // Catalan linguistic/cultural community
227         "com",                // commercial enterprises
228         "coop",               // cooperative associations
229         "info",               // informational sites
230         "jobs",               // Human Resource managers
231         "mobi",               // mobile products and services
232         "museum",             // museums, surprisingly enough
233         "name",               // individuals' sites
234         "net",                // internet support infrastructure/business
235         "org",                // noncommercial organizations
236         "pro",                // credentialed professionals and entities
237         "tel",                // contact data for businesses and individuals
238         "travel",             // entities in the travel industry
239         "gov",                // United States Government
240         "edu",                // accredited postsecondary US education entities
241         "mil",                // United States Military
242         "int"                 // organizations established by international treaty
243     };
244 
245     private static final String[] COUNTRY_CODE_TLDS = new String[] {
246         "ac",                 // Ascension Island
247         "ad",                 // Andorra
248         "ae",                 // United Arab Emirates
249         "af",                 // Afghanistan
250         "ag",                 // Antigua and Barbuda
251         "ai",                 // Anguilla
252         "al",                 // Albania
253         "am",                 // Armenia
254         "an",                 // Netherlands Antilles
255         "ao",                 // Angola
256         "aq",                 // Antarctica
257         "ar",                 // Argentina
258         "as",                 // American Samoa
259         "at",                 // Austria
260         "au",                 // Australia (includes Ashmore and Cartier Islands and Coral Sea Islands)
261         "aw",                 // Aruba
262         "ax",                 // Åland
263         "az",                 // Azerbaijan
264         "ba",                 // Bosnia and Herzegovina
265         "bb",                 // Barbados
266         "bd",                 // Bangladesh
267         "be",                 // Belgium
268         "bf",                 // Burkina Faso
269         "bg",                 // Bulgaria
270         "bh",                 // Bahrain
271         "bi",                 // Burundi
272         "bj",                 // Benin
273         "bm",                 // Bermuda
274         "bn",                 // Brunei Darussalam
275         "bo",                 // Bolivia
276         "br",                 // Brazil
277         "bs",                 // Bahamas
278         "bt",                 // Bhutan
279         "bv",                 // Bouvet Island
280         "bw",                 // Botswana
281         "by",                 // Belarus
282         "bz",                 // Belize
283         "ca",                 // Canada
284         "cc",                 // Cocos (Keeling) Islands
285         "cd",                 // Democratic Republic of the Congo (formerly Zaire)
286         "cf",                 // Central African Republic
287         "cg",                 // Republic of the Congo
288         "ch",                 // Switzerland
289         "ci",                 // Côte d'Ivoire
290         "ck",                 // Cook Islands
291         "cl",                 // Chile
292         "cm",                 // Cameroon
293         "cn",                 // China, mainland
294         "co",                 // Colombia
295         "cr",                 // Costa Rica
296         "cu",                 // Cuba
297         "cv",                 // Cape Verde
298         "cx",                 // Christmas Island
299         "cy",                 // Cyprus
300         "cz",                 // Czech Republic
301         "de",                 // Germany
302         "dj",                 // Djibouti
303         "dk",                 // Denmark
304         "dm",                 // Dominica
305         "do",                 // Dominican Republic
306         "dz",                 // Algeria
307         "ec",                 // Ecuador
308         "ee",                 // Estonia
309         "eg",                 // Egypt
310         "er",                 // Eritrea
311         "es",                 // Spain
312         "et",                 // Ethiopia
313         "eu",                 // European Union
314         "fi",                 // Finland
315         "fj",                 // Fiji
316         "fk",                 // Falkland Islands
317         "fm",                 // Federated States of Micronesia
318         "fo",                 // Faroe Islands
319         "fr",                 // France
320         "ga",                 // Gabon
321         "gb",                 // Great Britain (United Kingdom)
322         "gd",                 // Grenada
323         "ge",                 // Georgia
324         "gf",                 // French Guiana
325         "gg",                 // Guernsey
326         "gh",                 // Ghana
327         "gi",                 // Gibraltar
328         "gl",                 // Greenland
329         "gm",                 // The Gambia
330         "gn",                 // Guinea
331         "gp",                 // Guadeloupe
332         "gq",                 // Equatorial Guinea
333         "gr",                 // Greece
334         "gs",                 // South Georgia and the South Sandwich Islands
335         "gt",                 // Guatemala
336         "gu",                 // Guam
337         "gw",                 // Guinea-Bissau
338         "gy",                 // Guyana
339         "hk",                 // Hong Kong
340         "hm",                 // Heard Island and McDonald Islands
341         "hn",                 // Honduras
342         "hr",                 // Croatia (Hrvatska)
343         "ht",                 // Haiti
344         "hu",                 // Hungary
345         "id",                 // Indonesia
346         "ie",                 // Ireland (Éire)
347         "il",                 // Israel
348         "im",                 // Isle of Man
349         "in",                 // India
350         "io",                 // British Indian Ocean Territory
351         "iq",                 // Iraq
352         "ir",                 // Iran
353         "is",                 // Iceland
354         "it",                 // Italy
355         "je",                 // Jersey
356         "jm",                 // Jamaica
357         "jo",                 // Jordan
358         "jp",                 // Japan
359         "ke",                 // Kenya
360         "kg",                 // Kyrgyzstan
361         "kh",                 // Cambodia (Khmer)
362         "ki",                 // Kiribati
363         "km",                 // Comoros
364         "kn",                 // Saint Kitts and Nevis
365         "kp",                 // North Korea
366         "kr",                 // South Korea
367         "kw",                 // Kuwait
368         "ky",                 // Cayman Islands
369         "kz",                 // Kazakhstan
370         "la",                 // Laos (currently being marketed as the official domain for Los Angeles)
371         "lb",                 // Lebanon
372         "lc",                 // Saint Lucia
373         "li",                 // Liechtenstein
374         "lk",                 // Sri Lanka
375         "lr",                 // Liberia
376         "ls",                 // Lesotho
377         "lt",                 // Lithuania
378         "lu",                 // Luxembourg
379         "lv",                 // Latvia
380         "ly",                 // Libya
381         "ma",                 // Morocco
382         "mc",                 // Monaco
383         "md",                 // Moldova
384         "me",                 // Montenegro
385         "mg",                 // Madagascar
386         "mh",                 // Marshall Islands
387         "mk",                 // Republic of Macedonia
388         "ml",                 // Mali
389         "mm",                 // Myanmar
390         "mn",                 // Mongolia
391         "mo",                 // Macau
392         "mp",                 // Northern Mariana Islands
393         "mq",                 // Martinique
394         "mr",                 // Mauritania
395         "ms",                 // Montserrat
396         "mt",                 // Malta
397         "mu",                 // Mauritius
398         "mv",                 // Maldives
399         "mw",                 // Malawi
400         "mx",                 // Mexico
401         "my",                 // Malaysia
402         "mz",                 // Mozambique
403         "na",                 // Namibia
404         "nc",                 // New Caledonia
405         "ne",                 // Niger
406         "nf",                 // Norfolk Island
407         "ng",                 // Nigeria
408         "ni",                 // Nicaragua
409         "nl",                 // Netherlands
410         "no",                 // Norway
411         "np",                 // Nepal
412         "nr",                 // Nauru
413         "nu",                 // Niue
414         "nz",                 // New Zealand
415         "om",                 // Oman
416         "pa",                 // Panama
417         "pe",                 // Peru
418         "pf",                 // French Polynesia With Clipperton Island
419         "pg",                 // Papua New Guinea
420         "ph",                 // Philippines
421         "pk",                 // Pakistan
422         "pl",                 // Poland
423         "pm",                 // Saint-Pierre and Miquelon
424         "pn",                 // Pitcairn Islands
425         "pr",                 // Puerto Rico
426         "ps",                 // Palestinian territories (PA-controlled West Bank and Gaza Strip)
427         "pt",                 // Portugal
428         "pw",                 // Palau
429         "py",                 // Paraguay
430         "qa",                 // Qatar
431         "re",                 // Réunion
432         "ro",                 // Romania
433         "rs",                 // Serbia
434         "ru",                 // Russia
435         "rw",                 // Rwanda
436         "sa",                 // Saudi Arabia
437         "sb",                 // Solomon Islands
438         "sc",                 // Seychelles
439         "sd",                 // Sudan
440         "se",                 // Sweden
441         "sg",                 // Singapore
442         "sh",                 // Saint Helena
443         "si",                 // Slovenia
444         "sj",                 // Svalbard and Jan Mayen Islands Not in use (Norwegian dependencies; see .no)
445         "sk",                 // Slovakia
446         "sl",                 // Sierra Leone
447         "sm",                 // San Marino
448         "sn",                 // Senegal
449         "so",                 // Somalia
450         "sr",                 // Suriname
451         "st",                 // São Tomé and Príncipe
452         "su",                 // Soviet Union (deprecated)
453         "sv",                 // El Salvador
454         "sy",                 // Syria
455         "sz",                 // Swaziland
456         "tc",                 // Turks and Caicos Islands
457         "td",                 // Chad
458         "tf",                 // French Southern and Antarctic Lands
459         "tg",                 // Togo
460         "th",                 // Thailand
461         "tj",                 // Tajikistan
462         "tk",                 // Tokelau
463         "tl",                 // East Timor (deprecated old code)
464         "tm",                 // Turkmenistan
465         "tn",                 // Tunisia
466         "to",                 // Tonga
467         "tp",                 // East Timor
468         "tr",                 // Turkey
469         "tt",                 // Trinidad and Tobago
470         "tv",                 // Tuvalu
471         "tw",                 // Taiwan, Republic of China
472         "tz",                 // Tanzania
473         "ua",                 // Ukraine
474         "ug",                 // Uganda
475         "uk",                 // United Kingdom
476         "um",                 // United States Minor Outlying Islands
477         "us",                 // United States of America
478         "uy",                 // Uruguay
479         "uz",                 // Uzbekistan
480         "va",                 // Vatican City State
481         "vc",                 // Saint Vincent and the Grenadines
482         "ve",                 // Venezuela
483         "vg",                 // British Virgin Islands
484         "vi",                 // U.S. Virgin Islands
485         "vn",                 // Vietnam
486         "vu",                 // Vanuatu
487         "wf",                 // Wallis and Futuna
488         "ws",                 // Samoa (formerly Western Samoa)
489         "ye",                 // Yemen
490         "yt",                 // Mayotte
491         "yu",                 // Serbia and Montenegro (originally Yugoslavia)
492         "za",                 // South Africa
493         "zm",                 // Zambia
494         "zw",                 // Zimbabwe
495     };
496 
497     private static final String[] LOCAL_TLDS = new String[] {
498        "localhost",           // RFC2606 defined
499        "localdomain"          // Also widely used as localhost.localdomain
500    };
501 
502     private static final List INFRASTRUCTURE_TLD_LIST = Arrays.asList(INFRASTRUCTURE_TLDS);
503     private static final List GENERIC_TLD_LIST = Arrays.asList(GENERIC_TLDS);
504     private static final List COUNTRY_CODE_TLD_LIST = Arrays.asList(COUNTRY_CODE_TLDS);
505     private static final List LOCAL_TLD_LIST = Arrays.asList(LOCAL_TLDS);
506 }