001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.validator.routines;
018
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
022
023/**
024 * <p><b>Domain name</b> validation routines.</p>
025 *
026 * <p>
027 * This validator provides methods for validating Internet domain names
028 * and top-level domains.
029 * </p>
030 *
031 * <p>Domain names are evaluated according
032 * to the standards <a href="http://www.ietf.org/rfc/rfc1034.txt">RFC1034</a>,
033 * section 3, and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC1123</a>,
034 * section 2.1. No accomodation is provided for the specialized needs of
035 * other applications; if the domain name has been URL-encoded, for example,
036 * validation will fail even though the equivalent plaintext version of the
037 * same name would have passed.
038 * </p>
039 *
040 * <p>
041 * Validation is also provided for top-level domains (TLDs) as defined and
042 * maintained by the Internet Assigned Numbers Authority (IANA):
043 * </p>
044 *
045 *   <ul>
046 *     <li>{@link #isValidInfrastructureTld} - validates infrastructure TLDs
047 *         (<code>.arpa</code>, etc.)</li>
048 *     <li>{@link #isValidGenericTld} - validates generic TLDs
049 *         (<code>.com, .org</code>, etc.)</li>
050 *     <li>{@link #isValidCountryCodeTld} - validates country code TLDs
051 *         (<code>.us, .uk, .cn</code>, etc.)</li>
052 *   </ul>
053 *
054 * <p>
055 * (<b>NOTE</b>: This class does not provide IP address lookup for domain names or
056 * methods to ensure that a given domain name matches a specific IP; see
057 * {@link java.net.InetAddress} for that functionality.)
058 * </p>
059 *
060 * @version $Revision: 1227719 $ $Date: 2012-01-05 18:45:51 +0100 (Do, 05 Jan 2012) $
061 * @since Validator 1.4
062 */
063public class DomainValidator implements Serializable {
064
065    private static final long serialVersionUID = -4407125112880174009L;
066
067    // Regular expression strings for hostnames (derived from RFC2396 and RFC 1123)
068    private static final String DOMAIN_LABEL_REGEX = "\\p{Alnum}(?>[\\p{Alnum}-]*\\p{Alnum})*";
069    private static final String TOP_LABEL_REGEX = "\\p{Alpha}{2,}";
070    private static final String DOMAIN_NAME_REGEX =
071            "^(?:" + DOMAIN_LABEL_REGEX + "\\.)+" + "(" + TOP_LABEL_REGEX + ")$";
072
073    private final boolean allowLocal;
074
075    /**
076     * Singleton instance of this validator, which
077     *  doesn't consider local addresses as valid.
078     */
079    private static final DomainValidator DOMAIN_VALIDATOR = new DomainValidator(false);
080
081    /**
082     * Singleton instance of this validator, which does
083     *  consider local addresses valid.
084     */
085    private static final DomainValidator DOMAIN_VALIDATOR_WITH_LOCAL = new DomainValidator(true);
086
087    /**
088     * RegexValidator for matching domains.
089     */
090    private final RegexValidator domainRegex =
091            new RegexValidator(DOMAIN_NAME_REGEX);
092    /**
093     * RegexValidator for matching the a local hostname
094     */
095    private final RegexValidator hostnameRegex =
096            new RegexValidator(DOMAIN_LABEL_REGEX);
097
098    /**
099     * Returns the singleton instance of this validator. It
100     *  will not consider local addresses as valid.
101     * @return the singleton instance of this validator
102     */
103    public static DomainValidator getInstance() {
104        return DOMAIN_VALIDATOR;
105    }
106
107    /**
108     * Returns the singleton instance of this validator,
109     *  with local validation as required.
110     * @param allowLocal Should local addresses be considered valid?
111     * @return the singleton instance of this validator
112     */
113    public static DomainValidator getInstance(boolean allowLocal) {
114       if(allowLocal) {
115          return DOMAIN_VALIDATOR_WITH_LOCAL;
116       }
117       return DOMAIN_VALIDATOR;
118    }
119
120    /** Private constructor. */
121    private DomainValidator(boolean allowLocal) {
122       this.allowLocal = allowLocal;
123    }
124
125    /**
126     * Returns true if the specified <code>String</code> parses
127     * as a valid domain name with a recognized top-level domain.
128     * The parsing is case-sensitive.
129     * @param domain the parameter to check for domain name syntax
130     * @return true if the parameter is a valid domain name
131     */
132    public boolean isValid(String domain) {
133        String[] groups = domainRegex.match(domain);
134        if (groups != null && groups.length > 0) {
135            return isValidTld(groups[0]);
136        } else if(allowLocal) {
137            if (hostnameRegex.isValid(domain)) {
138               return true;
139            }
140        }
141        return false;
142    }
143
144    /**
145     * Returns true if the specified <code>String</code> matches any
146     * IANA-defined top-level domain. Leading dots are ignored if present.
147     * The search is case-sensitive.
148     * @param tld the parameter to check for TLD status
149     * @return true if the parameter is a TLD
150     */
151    public boolean isValidTld(String tld) {
152        if(allowLocal && isValidLocalTld(tld)) {
153           return true;
154        }
155        return isValidInfrastructureTld(tld)
156                || isValidGenericTld(tld)
157                || isValidCountryCodeTld(tld);
158    }
159
160    /**
161     * Returns true if the specified <code>String</code> matches any
162     * IANA-defined infrastructure top-level domain. Leading dots are
163     * ignored if present. The search is case-sensitive.
164     * @param iTld the parameter to check for infrastructure TLD status
165     * @return true if the parameter is an infrastructure TLD
166     */
167    public boolean isValidInfrastructureTld(String iTld) {
168        return INFRASTRUCTURE_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
169    }
170
171    /**
172     * Returns true if the specified <code>String</code> matches any
173     * IANA-defined generic top-level domain. Leading dots are ignored
174     * if present. The search is case-sensitive.
175     * @param gTld the parameter to check for generic TLD status
176     * @return true if the parameter is a generic TLD
177     */
178    public boolean isValidGenericTld(String gTld) {
179        return GENERIC_TLD_LIST.contains(chompLeadingDot(gTld.toLowerCase()));
180    }
181
182    /**
183     * Returns true if the specified <code>String</code> matches any
184     * IANA-defined country code top-level domain. Leading dots are
185     * ignored if present. The search is case-sensitive.
186     * @param ccTld the parameter to check for country code TLD status
187     * @return true if the parameter is a country code TLD
188     */
189    public boolean isValidCountryCodeTld(String ccTld) {
190        return COUNTRY_CODE_TLD_LIST.contains(chompLeadingDot(ccTld.toLowerCase()));
191    }
192
193    /**
194     * Returns true if the specified <code>String</code> matches any
195     * widely used "local" domains (localhost or localdomain). Leading dots are
196     *  ignored if present. The search is case-sensitive.
197     * @param iTld the parameter to check for local TLD status
198     * @return true if the parameter is an local TLD
199     */
200    public boolean isValidLocalTld(String iTld) {
201        return LOCAL_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
202    }
203
204    private String chompLeadingDot(String str) {
205        if (str.startsWith(".")) {
206            return str.substring(1);
207        } else {
208            return str;
209        }
210    }
211
212    // ---------------------------------------------
213    // ----- TLDs defined by IANA
214    // ----- Authoritative and comprehensive list at:
215    // ----- http://data.iana.org/TLD/tlds-alpha-by-domain.txt
216
217    private static final String[] INFRASTRUCTURE_TLDS = new String[] {
218        "arpa",               // internet infrastructure
219        "root"                // diagnostic marker for non-truncated root zone
220    };
221
222    private static final String[] GENERIC_TLDS = new String[] {
223        "aero",               // air transport industry
224        "asia",               // Pan-Asia/Asia Pacific
225        "biz",                // businesses
226        "cat",                // Catalan linguistic/cultural community
227        "com",                // commercial enterprises
228        "coop",               // cooperative associations
229        "info",               // informational sites
230        "jobs",               // Human Resource managers
231        "mobi",               // mobile products and services
232        "museum",             // museums, surprisingly enough
233        "name",               // individuals' sites
234        "net",                // internet support infrastructure/business
235        "org",                // noncommercial organizations
236        "pro",                // credentialed professionals and entities
237        "tel",                // contact data for businesses and individuals
238        "travel",             // entities in the travel industry
239        "gov",                // United States Government
240        "edu",                // accredited postsecondary US education entities
241        "mil",                // United States Military
242        "int"                 // organizations established by international treaty
243    };
244
245    private static final String[] COUNTRY_CODE_TLDS = new String[] {
246        "ac",                 // Ascension Island
247        "ad",                 // Andorra
248        "ae",                 // United Arab Emirates
249        "af",                 // Afghanistan
250        "ag",                 // Antigua and Barbuda
251        "ai",                 // Anguilla
252        "al",                 // Albania
253        "am",                 // Armenia
254        "an",                 // Netherlands Antilles
255        "ao",                 // Angola
256        "aq",                 // Antarctica
257        "ar",                 // Argentina
258        "as",                 // American Samoa
259        "at",                 // Austria
260        "au",                 // Australia (includes Ashmore and Cartier Islands and Coral Sea Islands)
261        "aw",                 // Aruba
262        "ax",                 // Åland
263        "az",                 // Azerbaijan
264        "ba",                 // Bosnia and Herzegovina
265        "bb",                 // Barbados
266        "bd",                 // Bangladesh
267        "be",                 // Belgium
268        "bf",                 // Burkina Faso
269        "bg",                 // Bulgaria
270        "bh",                 // Bahrain
271        "bi",                 // Burundi
272        "bj",                 // Benin
273        "bm",                 // Bermuda
274        "bn",                 // Brunei Darussalam
275        "bo",                 // Bolivia
276        "br",                 // Brazil
277        "bs",                 // Bahamas
278        "bt",                 // Bhutan
279        "bv",                 // Bouvet Island
280        "bw",                 // Botswana
281        "by",                 // Belarus
282        "bz",                 // Belize
283        "ca",                 // Canada
284        "cc",                 // Cocos (Keeling) Islands
285        "cd",                 // Democratic Republic of the Congo (formerly Zaire)
286        "cf",                 // Central African Republic
287        "cg",                 // Republic of the Congo
288        "ch",                 // Switzerland
289        "ci",                 // Côte d'Ivoire
290        "ck",                 // Cook Islands
291        "cl",                 // Chile
292        "cm",                 // Cameroon
293        "cn",                 // China, mainland
294        "co",                 // Colombia
295        "cr",                 // Costa Rica
296        "cu",                 // Cuba
297        "cv",                 // Cape Verde
298        "cx",                 // Christmas Island
299        "cy",                 // Cyprus
300        "cz",                 // Czech Republic
301        "de",                 // Germany
302        "dj",                 // Djibouti
303        "dk",                 // Denmark
304        "dm",                 // Dominica
305        "do",                 // Dominican Republic
306        "dz",                 // Algeria
307        "ec",                 // Ecuador
308        "ee",                 // Estonia
309        "eg",                 // Egypt
310        "er",                 // Eritrea
311        "es",                 // Spain
312        "et",                 // Ethiopia
313        "eu",                 // European Union
314        "fi",                 // Finland
315        "fj",                 // Fiji
316        "fk",                 // Falkland Islands
317        "fm",                 // Federated States of Micronesia
318        "fo",                 // Faroe Islands
319        "fr",                 // France
320        "ga",                 // Gabon
321        "gb",                 // Great Britain (United Kingdom)
322        "gd",                 // Grenada
323        "ge",                 // Georgia
324        "gf",                 // French Guiana
325        "gg",                 // Guernsey
326        "gh",                 // Ghana
327        "gi",                 // Gibraltar
328        "gl",                 // Greenland
329        "gm",                 // The Gambia
330        "gn",                 // Guinea
331        "gp",                 // Guadeloupe
332        "gq",                 // Equatorial Guinea
333        "gr",                 // Greece
334        "gs",                 // South Georgia and the South Sandwich Islands
335        "gt",                 // Guatemala
336        "gu",                 // Guam
337        "gw",                 // Guinea-Bissau
338        "gy",                 // Guyana
339        "hk",                 // Hong Kong
340        "hm",                 // Heard Island and McDonald Islands
341        "hn",                 // Honduras
342        "hr",                 // Croatia (Hrvatska)
343        "ht",                 // Haiti
344        "hu",                 // Hungary
345        "id",                 // Indonesia
346        "ie",                 // Ireland (Éire)
347        "il",                 // Israel
348        "im",                 // Isle of Man
349        "in",                 // India
350        "io",                 // British Indian Ocean Territory
351        "iq",                 // Iraq
352        "ir",                 // Iran
353        "is",                 // Iceland
354        "it",                 // Italy
355        "je",                 // Jersey
356        "jm",                 // Jamaica
357        "jo",                 // Jordan
358        "jp",                 // Japan
359        "ke",                 // Kenya
360        "kg",                 // Kyrgyzstan
361        "kh",                 // Cambodia (Khmer)
362        "ki",                 // Kiribati
363        "km",                 // Comoros
364        "kn",                 // Saint Kitts and Nevis
365        "kp",                 // North Korea
366        "kr",                 // South Korea
367        "kw",                 // Kuwait
368        "ky",                 // Cayman Islands
369        "kz",                 // Kazakhstan
370        "la",                 // Laos (currently being marketed as the official domain for Los Angeles)
371        "lb",                 // Lebanon
372        "lc",                 // Saint Lucia
373        "li",                 // Liechtenstein
374        "lk",                 // Sri Lanka
375        "lr",                 // Liberia
376        "ls",                 // Lesotho
377        "lt",                 // Lithuania
378        "lu",                 // Luxembourg
379        "lv",                 // Latvia
380        "ly",                 // Libya
381        "ma",                 // Morocco
382        "mc",                 // Monaco
383        "md",                 // Moldova
384        "me",                 // Montenegro
385        "mg",                 // Madagascar
386        "mh",                 // Marshall Islands
387        "mk",                 // Republic of Macedonia
388        "ml",                 // Mali
389        "mm",                 // Myanmar
390        "mn",                 // Mongolia
391        "mo",                 // Macau
392        "mp",                 // Northern Mariana Islands
393        "mq",                 // Martinique
394        "mr",                 // Mauritania
395        "ms",                 // Montserrat
396        "mt",                 // Malta
397        "mu",                 // Mauritius
398        "mv",                 // Maldives
399        "mw",                 // Malawi
400        "mx",                 // Mexico
401        "my",                 // Malaysia
402        "mz",                 // Mozambique
403        "na",                 // Namibia
404        "nc",                 // New Caledonia
405        "ne",                 // Niger
406        "nf",                 // Norfolk Island
407        "ng",                 // Nigeria
408        "ni",                 // Nicaragua
409        "nl",                 // Netherlands
410        "no",                 // Norway
411        "np",                 // Nepal
412        "nr",                 // Nauru
413        "nu",                 // Niue
414        "nz",                 // New Zealand
415        "om",                 // Oman
416        "pa",                 // Panama
417        "pe",                 // Peru
418        "pf",                 // French Polynesia With Clipperton Island
419        "pg",                 // Papua New Guinea
420        "ph",                 // Philippines
421        "pk",                 // Pakistan
422        "pl",                 // Poland
423        "pm",                 // Saint-Pierre and Miquelon
424        "pn",                 // Pitcairn Islands
425        "pr",                 // Puerto Rico
426        "ps",                 // Palestinian territories (PA-controlled West Bank and Gaza Strip)
427        "pt",                 // Portugal
428        "pw",                 // Palau
429        "py",                 // Paraguay
430        "qa",                 // Qatar
431        "re",                 // Réunion
432        "ro",                 // Romania
433        "rs",                 // Serbia
434        "ru",                 // Russia
435        "rw",                 // Rwanda
436        "sa",                 // Saudi Arabia
437        "sb",                 // Solomon Islands
438        "sc",                 // Seychelles
439        "sd",                 // Sudan
440        "se",                 // Sweden
441        "sg",                 // Singapore
442        "sh",                 // Saint Helena
443        "si",                 // Slovenia
444        "sj",                 // Svalbard and Jan Mayen Islands Not in use (Norwegian dependencies; see .no)
445        "sk",                 // Slovakia
446        "sl",                 // Sierra Leone
447        "sm",                 // San Marino
448        "sn",                 // Senegal
449        "so",                 // Somalia
450        "sr",                 // Suriname
451        "st",                 // São Tomé and Príncipe
452        "su",                 // Soviet Union (deprecated)
453        "sv",                 // El Salvador
454        "sy",                 // Syria
455        "sz",                 // Swaziland
456        "tc",                 // Turks and Caicos Islands
457        "td",                 // Chad
458        "tf",                 // French Southern and Antarctic Lands
459        "tg",                 // Togo
460        "th",                 // Thailand
461        "tj",                 // Tajikistan
462        "tk",                 // Tokelau
463        "tl",                 // East Timor (deprecated old code)
464        "tm",                 // Turkmenistan
465        "tn",                 // Tunisia
466        "to",                 // Tonga
467        "tp",                 // East Timor
468        "tr",                 // Turkey
469        "tt",                 // Trinidad and Tobago
470        "tv",                 // Tuvalu
471        "tw",                 // Taiwan, Republic of China
472        "tz",                 // Tanzania
473        "ua",                 // Ukraine
474        "ug",                 // Uganda
475        "uk",                 // United Kingdom
476        "um",                 // United States Minor Outlying Islands
477        "us",                 // United States of America
478        "uy",                 // Uruguay
479        "uz",                 // Uzbekistan
480        "va",                 // Vatican City State
481        "vc",                 // Saint Vincent and the Grenadines
482        "ve",                 // Venezuela
483        "vg",                 // British Virgin Islands
484        "vi",                 // U.S. Virgin Islands
485        "vn",                 // Vietnam
486        "vu",                 // Vanuatu
487        "wf",                 // Wallis and Futuna
488        "ws",                 // Samoa (formerly Western Samoa)
489        "ye",                 // Yemen
490        "yt",                 // Mayotte
491        "yu",                 // Serbia and Montenegro (originally Yugoslavia)
492        "za",                 // South Africa
493        "zm",                 // Zambia
494        "zw",                 // Zimbabwe
495    };
496
497    private static final String[] LOCAL_TLDS = new String[] {
498       "localhost",           // RFC2606 defined
499       "localdomain"          // Also widely used as localhost.localdomain
500   };
501
502    private static final List INFRASTRUCTURE_TLD_LIST = Arrays.asList(INFRASTRUCTURE_TLDS);
503    private static final List GENERIC_TLD_LIST = Arrays.asList(GENERIC_TLDS);
504    private static final List COUNTRY_CODE_TLD_LIST = Arrays.asList(COUNTRY_CODE_TLDS);
505    private static final List LOCAL_TLD_LIST = Arrays.asList(LOCAL_TLDS);
506}