001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.validator.routines;
018
019import java.io.Serializable;
020import java.lang.reflect.InvocationTargetException;
021import java.lang.reflect.Method;
022import java.util.Arrays;
023import java.util.Locale;
024
025/**
026 * <p><b>Domain name</b> validation routines.</p>
027 *
028 * <p>
029 * This validator provides methods for validating Internet domain names
030 * and top-level domains.
031 * </p>
032 *
033 * <p>Domain names are evaluated according
034 * to the standards <a href="http://www.ietf.org/rfc/rfc1034.txt">RFC1034</a>,
035 * section 3, and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC1123</a>,
036 * section 2.1. No accommodation is provided for the specialized needs of
037 * other applications; if the domain name has been URL-encoded, for example,
038 * validation will fail even though the equivalent plaintext version of the
039 * same name would have passed.
040 * </p>
041 *
042 * <p>
043 * Validation is also provided for top-level domains (TLDs) as defined and
044 * maintained by the Internet Assigned Numbers Authority (IANA):
045 * </p>
046 *
047 *   <ul>
048 *     <li>{@link #isValidInfrastructureTld} - validates infrastructure TLDs
049 *         (<code>.arpa</code>, etc.)</li>
050 *     <li>{@link #isValidGenericTld} - validates generic TLDs
051 *         (<code>.com, .org</code>, etc.)</li>
052 *     <li>{@link #isValidCountryCodeTld} - validates country code TLDs
053 *         (<code>.us, .uk, .cn</code>, etc.)</li>
054 *   </ul>
055 *
056 * <p>
057 * (<b>NOTE</b>: This class does not provide IP address lookup for domain names or
058 * methods to ensure that a given domain name matches a specific IP; see
059 * {@link java.net.InetAddress} for that functionality.)
060 * </p>
061 *
062 * @version $Revision: 1650777 $
063 * @since Validator 1.4
064 */
065public class DomainValidator implements Serializable {
066
067    private static final long serialVersionUID = -4407125112880174009L;
068
069    // Regular expression strings for hostnames (derived from RFC2396 and RFC 1123)
070
071    // RFC2396: domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
072    // Max 63 characters
073    private static final String DOMAIN_LABEL_REGEX = "\\p{Alnum}(?>[\\p{Alnum}-]{0,61}\\p{Alnum})?";
074
075    // RFC2396 toplabel = alpha | alpha *( alphanum | "-" ) alphanum
076    // Max 63 characters
077    private static final String TOP_LABEL_REGEX = "\\p{Alpha}(?>[\\p{Alnum}-]{0,61}\\p{Alnum})?";
078
079    // RFC2396 hostname = *( domainlabel "." ) toplabel [ "." ]
080    // Note that the regex currently requires both a domain label and a top level label, whereas
081    // the RFC does not. This is because the regex is used to detect if a TLD is present.
082    // If the match fails, input is checked against DOMAIN_LABEL_REGEX (hostnameRegex)
083    // RFC1123 sec 2.1 allows hostnames to start with a digit
084    private static final String DOMAIN_NAME_REGEX =
085            "^(?:" + DOMAIN_LABEL_REGEX + "\\.)+" + "(" + TOP_LABEL_REGEX + ")\\.?$";
086
087    private final boolean allowLocal;
088
089    /**
090     * Singleton instance of this validator, which
091     *  doesn't consider local addresses as valid.
092     */
093    private static final DomainValidator DOMAIN_VALIDATOR = new DomainValidator(false);
094
095    /**
096     * Singleton instance of this validator, which does
097     *  consider local addresses valid.
098     */
099    private static final DomainValidator DOMAIN_VALIDATOR_WITH_LOCAL = new DomainValidator(true);
100
101    /**
102     * RegexValidator for matching domains.
103     */
104    private final RegexValidator domainRegex =
105            new RegexValidator(DOMAIN_NAME_REGEX);
106    /**
107     * RegexValidator for matching a local hostname
108     */
109    // RFC1123 sec 2.1 allows hostnames to start with a digit
110    private final RegexValidator hostnameRegex =
111            new RegexValidator(DOMAIN_LABEL_REGEX);
112
113    /**
114     * Returns the singleton instance of this validator. It
115     *  will not consider local addresses as valid.
116     * @return the singleton instance of this validator
117     */
118    public static DomainValidator getInstance() {
119        return DOMAIN_VALIDATOR;
120    }
121
122    /**
123     * Returns the singleton instance of this validator,
124     *  with local validation as required.
125     * @param allowLocal Should local addresses be considered valid?
126     * @return the singleton instance of this validator
127     */
128    public static DomainValidator getInstance(boolean allowLocal) {
129       if(allowLocal) {
130          return DOMAIN_VALIDATOR_WITH_LOCAL;
131       }
132       return DOMAIN_VALIDATOR;
133    }
134
135    /** Private constructor. */
136    private DomainValidator(boolean allowLocal) {
137       this.allowLocal = allowLocal;
138    }
139
140    /**
141     * Returns true if the specified <code>String</code> parses
142     * as a valid domain name with a recognized top-level domain.
143     * The parsing is case-insensitive.
144     * @param domain the parameter to check for domain name syntax
145     * @return true if the parameter is a valid domain name
146     */
147    public boolean isValid(String domain) {
148        if (domain == null) {
149            return false;
150        }
151        domain = unicodeToASCII(domain);
152        // hosts must be equally reachable via punycode and Unicode;
153        // Unicode is never shorter than punycode, so check punycode
154        // if domain did not convert, then it will be caught by ASCII
155        // checks in the regexes below
156        if (domain.length() > 253) {
157            return false;
158        }
159        String[] groups = domainRegex.match(domain);
160        if (groups != null && groups.length > 0) {
161            return isValidTld(groups[0]);
162        }
163        return allowLocal && hostnameRegex.isValid(domain);
164    }
165
166    // package protected for unit test access
167    // must agree with isValid() above
168    final boolean isValidDomainSyntax(String domain) {
169        if (domain == null) {
170            return false;
171        }
172        domain = unicodeToASCII(domain);
173        // hosts must be equally reachable via punycode and Unicode;
174        // Unicode is never shorter than punycode, so check punycode
175        // if domain did not convert, then it will be caught by ASCII
176        // checks in the regexes below
177        if (domain.length() > 253) {
178            return false;
179        }
180        String[] groups = domainRegex.match(domain);
181        return (groups != null && groups.length > 0)
182                || hostnameRegex.isValid(domain);
183    }
184
185    /**
186     * Returns true if the specified <code>String</code> matches any
187     * IANA-defined top-level domain. Leading dots are ignored if present.
188     * The search is case-insensitive.
189     * @param tld the parameter to check for TLD status, not null
190     * @return true if the parameter is a TLD
191     */
192    public boolean isValidTld(String tld) {
193        tld = unicodeToASCII(tld);
194        if(allowLocal && isValidLocalTld(tld)) {
195           return true;
196        }
197        return isValidInfrastructureTld(tld)
198                || isValidGenericTld(tld)
199                || isValidCountryCodeTld(tld);
200    }
201
202    /**
203     * Returns true if the specified <code>String</code> matches any
204     * IANA-defined infrastructure top-level domain. Leading dots are
205     * ignored if present. The search is case-insensitive.
206     * @param iTld the parameter to check for infrastructure TLD status, not null
207     * @return true if the parameter is an infrastructure TLD
208     */
209    public boolean isValidInfrastructureTld(String iTld) {
210        iTld = unicodeToASCII(iTld);
211        return Arrays.binarySearch(INFRASTRUCTURE_TLDS, (chompLeadingDot(iTld.toLowerCase(Locale.ENGLISH)))) >= 0;
212    }
213
214    /**
215     * Returns true if the specified <code>String</code> matches any
216     * IANA-defined generic top-level domain. Leading dots are ignored
217     * if present. The search is case-insensitive.
218     * @param gTld the parameter to check for generic TLD status, not null
219     * @return true if the parameter is a generic TLD
220     */
221    public boolean isValidGenericTld(String gTld) {
222        gTld = unicodeToASCII(gTld);
223        return Arrays.binarySearch(GENERIC_TLDS, chompLeadingDot(gTld.toLowerCase(Locale.ENGLISH))) >= 0;
224    }
225
226    /**
227     * Returns true if the specified <code>String</code> matches any
228     * IANA-defined country code top-level domain. Leading dots are
229     * ignored if present. The search is case-insensitive.
230     * @param ccTld the parameter to check for country code TLD status, not null
231     * @return true if the parameter is a country code TLD
232     */
233    public boolean isValidCountryCodeTld(String ccTld) {
234        ccTld = unicodeToASCII(ccTld);
235        return Arrays.binarySearch(COUNTRY_CODE_TLDS, chompLeadingDot(ccTld.toLowerCase(Locale.ENGLISH))) >= 0;
236    }
237
238    /**
239     * Returns true if the specified <code>String</code> matches any
240     * widely used "local" domains (localhost or localdomain). Leading dots are
241     * ignored if present. The search is case-insensitive.
242     * @param lTld the parameter to check for local TLD status, not null
243     * @return true if the parameter is an local TLD
244     */
245    public boolean isValidLocalTld(String lTld) {
246        lTld = unicodeToASCII(lTld);
247        return Arrays.binarySearch(LOCAL_TLDS, chompLeadingDot(lTld.toLowerCase(Locale.ENGLISH))) >= 0;
248    }
249
250    private String chompLeadingDot(String str) {
251        if (str.startsWith(".")) {
252            return str.substring(1);
253        }
254        return str;
255    }
256
257    // ---------------------------------------------
258    // ----- TLDs defined by IANA
259    // ----- Authoritative and comprehensive list at:
260    // ----- http://data.iana.org/TLD/tlds-alpha-by-domain.txt
261
262    // Note that the above list is in UPPER case.
263    // The code currently converts strings to lower case (as per the tables below)
264
265    // IANA also provide an HTML list at http://www.iana.org/domains/root/db
    // Note that this contains several country code entries which are NOT in
    // the text file. These all have "Not assigned" in the "Sponsoring Organisation" column.
268    // For example (as of 2015-01-02):
269    // .bl  country-code    Not assigned
270    // .um  country-code    Not assigned
271
272    // WARNING: this array MUST be sorted, others it cannot be searched reliably using binary search
273    private static final String[] INFRASTRUCTURE_TLDS = new String[] {
274        "arpa",               // internet infrastructure
275    };
276
277    // WARNING: this array MUST be sorted, others it cannot be searched reliably using binary search
278    private static final String[] GENERIC_TLDS = new String[] {
279        "abogado",
280        "academy",
281        "accountants",
282        "active",
283        "actor",
284        "adult",
285        "aero",
286        "agency",
287        "airforce",
288        "allfinanz",
289        "alsace",
290        "amsterdam",
291        "android",
292        "aquarelle",
293        "archi",
294        "army",
295        "arpa",
296        "asia",
297        "associates",
298        "attorney",
299        "auction",
300        "audio",
301        "autos",
302        "axa",
303        "band",
304        "bar",
305        "bargains",
306        "bayern",
307        "beer",
308        "berlin",
309        "best",
310        "bid",
311        "bike",
312        "bio",
313        "biz",
314        "black",
315        "blackfriday",
316        "bloomberg",
317        "blue",
318        "bmw",
319        "bnpparibas",
320        "boo",
321        "boutique",
322        "brussels",
323        "budapest",
324        "build",
325        "builders",
326        "business",
327        "buzz",
328        "bzh",
329        "cab",
330        "cal",
331        "camera",
332        "camp",
333        "cancerresearch",
334        "capetown",
335        "capital",
336        "caravan",
337        "cards",
338        "care",
339        "career",
340        "careers",
341        "cartier",
342        "casa",
343        "cash",
344        "cat",
345        "catering",
346        "center",
347        "ceo",
348        "cern",
349        "channel",
350        "cheap",
351        "christmas",
352        "chrome",
353        "church",
354        "citic",
355        "city",
356        "claims",
357        "cleaning",
358        "click",
359        "clinic",
360        "clothing",
361        "club",
362        "coach",
363        "codes",
364        "coffee",
365        "college",
366        "cologne",
367        "com",
368        "community",
369        "company",
370        "computer",
371        "condos",
372        "construction",
373        "consulting",
374        "contractors",
375        "cooking",
376        "cool",
377        "coop",
378        "country",
379        "credit",
380        "creditcard",
381        "cricket",
382        "crs",
383        "cruises",
384        "cuisinella",
385        "cymru",
386        "dad",
387        "dance",
388        "dating",
389        "day",
390        "deals",
391        "degree",
392        "delivery",
393        "democrat",
394        "dental",
395        "dentist",
396        "desi",
397        "dev",
398        "diamonds",
399        "diet",
400        "digital",
401        "direct",
402        "directory",
403        "discount",
404        "dnp",
405        "docs",
406        "domains",
407        "doosan",
408        "durban",
409        "dvag",
410        "eat",
411        "edu",
412        "education",
413        "email",
414        "emerck",
415        "energy",
416        "engineer",
417        "engineering",
418        "enterprises",
419        "equipment",
420        "esq",
421        "estate",
422        "eurovision",
423        "eus",
424        "events",
425        "everbank",
426        "exchange",
427        "expert",
428        "exposed",
429        "fail",
430        "farm",
431        "fashion",
432        "feedback",
433        "finance",
434        "financial",
435        "firmdale",
436        "fish",
437        "fishing",
438        "fitness",
439        "flights",
440        "florist",
441        "flowers",
442        "flsmidth",
443        "fly",
444        "foo",
445        "forsale",
446        "foundation",
447        "frl",
448        "frogans",
449        "fund",
450        "furniture",
451        "futbol",
452        "gal",
453        "gallery",
454        "garden",
455        "gbiz",
456        "gent",
457        "ggee",
458        "gift",
459        "gifts",
460        "gives",
461        "glass",
462        "gle",
463        "global",
464        "globo",
465        "gmail",
466        "gmo",
467        "gmx",
468        "google",
469        "gop",
470        "gov",
471        "graphics",
472        "gratis",
473        "green",
474        "gripe",
475        "guide",
476        "guitars",
477        "guru",
478        "hamburg",
479        "haus",
480        "healthcare",
481        "help",
482        "here",
483        "hiphop",
484        "hiv",
485        "holdings",
486        "holiday",
487        "homes",
488        "horse",
489        "host",
490        "hosting",
491        "house",
492        "how",
493        "ibm",
494        "immo",
495        "immobilien",
496        "industries",
497        "info",
498        "ing",
499        "ink",
500        "institute",
501        "insure",
502        "int",
503        "international",
504        "investments",
505        "irish",
506        "iwc",
507        "jetzt",
508        "jobs",
509        "joburg",
510        "juegos",
511        "kaufen",
512        "kim",
513        "kitchen",
514        "kiwi",
515        "koeln",
516        "krd",
517        "kred",
518        "lacaixa",
519        "land",
520        "latrobe",
521        "lawyer",
522        "lds",
523        "lease",
524        "legal",
525        "lgbt",
526        "lidl",
527        "life",
528        "lighting",
529        "limited",
530        "limo",
531        "link",
532        "loans",
533        "london",
534        "lotto",
535        "ltda",
536        "luxe",
537        "luxury",
538        "madrid",
539        "maison",
540        "management",
541        "mango",
542        "market",
543        "marketing",
544        "media",
545        "meet",
546        "melbourne",
547        "meme",
548        "memorial",
549        "menu",
550        "miami",
551        "mil",
552        "mini",
553        "mobi",
554        "moda",
555        "moe",
556        "monash",
557        "money",
558        "mormon",
559        "mortgage",
560        "moscow",
561        "motorcycles",
562        "mov",
563        "museum",
564        "nagoya",
565        "name",
566        "navy",
567        "net",
568        "network",
569        "neustar",
570        "new",
571        "nexus",
572        "ngo",
573        "nhk",
574        "ninja",
575        "nra",
576        "nrw",
577        "nyc",
578        "okinawa",
579        "ong",
580        "onl",
581        "ooo",
582        "org",
583        "organic",
584        "osaka",
585        "otsuka",
586        "ovh",
587        "paris",
588        "partners",
589        "parts",
590        "party",
591        "pharmacy",
592        "photo",
593        "photography",
594        "photos",
595        "physio",
596        "pics",
597        "pictures",
598        "pink",
599        "pizza",
600        "place",
601        "plumbing",
602        "pohl",
603        "poker",
604        "porn",
605        "post",
606        "praxi",
607        "press",
608        "pro",
609        "prod",
610        "productions",
611        "prof",
612        "properties",
613        "property",
614        "pub",
615        "qpon",
616        "quebec",
617        "realtor",
618        "recipes",
619        "red",
620        "rehab",
621        "reise",
622        "reisen",
623        "reit",
624        "ren",
625        "rentals",
626        "repair",
627        "report",
628        "republican",
629        "rest",
630        "restaurant",
631        "reviews",
632        "rich",
633        "rio",
634        "rip",
635        "rocks",
636        "rodeo",
637        "rsvp",
638        "ruhr",
639        "ryukyu",
640        "saarland",
641        "sale",
642        "samsung",
643        "sarl",
644        "sca",
645        "scb",
646        "schmidt",
647        "schule",
648        "schwarz",
649        "science",
650        "scot",
651        "services",
652        "sew",
653        "sexy",
654        "shiksha",
655        "shoes",
656        "shriram",
657        "singles",
658        "sky",
659        "social",
660        "software",
661        "sohu",
662        "solar",
663        "solutions",
664        "soy",
665        "space",
666        "spiegel",
667        "supplies",
668        "supply",
669        "support",
670        "surf",
671        "surgery",
672        "suzuki",
673        "sydney",
674        "systems",
675        "taipei",
676        "tatar",
677        "tattoo",
678        "tax",
679        "technology",
680        "tel",
681        "tienda",
682        "tips",
683        "tires",
684        "tirol",
685        "today",
686        "tokyo",
687        "tools",
688        "top",
689        "town",
690        "toys",
691        "trade",
692        "training",
693        "travel",
694        "trust",
695        "tui",
696        "university",
697        "uno",
698        "uol",
699        "vacations",
700        "vegas",
701        "ventures",
702        "versicherung",
703        "vet",
704        "viajes",
705        "video",
706        "villas",
707        "vision",
708        "vlaanderen",
709        "vodka",
710        "vote",
711        "voting",
712        "voto",
713        "voyage",
714        "wales",
715        "wang",
716        "watch",
717        "webcam",
718        "website",
719        "wed",
720        "wedding",
721        "whoswho",
722        "wien",
723        "wiki",
724        "williamhill",
725        "wme",
726        "work",
727        "works",
728        "world",
729        "wtc",
730        "wtf",
731        "xn--1qqw23a", // 佛山 Guangzhou YU Wei Information Technology Co., Ltd.
732        "xn--3bst00m", // 集团 Eagle Horizon Limited
733        "xn--3ds443g", // 在线 TLD REGISTRY LIMITED
734        "xn--45q11c", // 八卦 Zodiac Scorpio Limited
735        "xn--4gbrim", // موقع Suhub Electronic Establishment
736        "xn--55qw42g", // 公益 China Organizational Name Administration Center
737        "xn--55qx5d", // 公司 Computer Network Information Center of Chinese Academy of Sciences (China Internet Network Information Center)
738        "xn--6frz82g", // 移动 Afilias Limited
739        "xn--6qq986b3xl", // 我爱你 Tycoon Treasure Limited
740        "xn--80adxhks", // москва Foundation for Assistance for Internet Technologies and Infrastructure Development (FAITID)
741        "xn--80asehdb", // онлайн CORE Association
742        "xn--80aswg", // сайт CORE Association
743        "xn--c1avg", // орг Public Interest Registry
744        "xn--cg4bki", // 삼성 SAMSUNG SDS CO., LTD
745        "xn--czr694b", // 商标 HU YI GLOBAL INFORMATION RESOURCES(HOLDING) COMPANY.HONGKONG LIMITED
746        "xn--czrs0t", // 商店 Wild Island, LLC
747        "xn--czru2d", // 商城 Zodiac Aquarius Limited
748        "xn--d1acj3b", // дети The Foundation for Network Initiatives “The Smart Internet”
749        "xn--fiq228c5hs", // 中文网 TLD REGISTRY LIMITED
750        "xn--fiq64b", // 中信 CITIC Group Corporation
751        "xn--flw351e", // 谷歌 Charleston Road Registry Inc.
752        "xn--hxt814e", // 网店 Zodiac Libra Limited
753        "xn--i1b6b1a6a2e", // संगठन Public Interest Registry
754        "xn--io0a7i", // 网络 Computer Network Information Center of Chinese Academy of Sciences (China Internet Network Information Center)
755        "xn--kput3i", // 手机 Beijing RITT-Net Technology Development Co., Ltd
756        "xn--mgbab2bd", // بازار CORE Association
757        "xn--ngbc5azd", // شبكة International Domain Registry Pty. Ltd.
758        "xn--nqv7f", // 机构 Public Interest Registry
759        "xn--nqv7fs00ema", // 组织机构 Public Interest Registry
760        "xn--p1acf", // рус Rusnames Limited
761        "xn--q9jyb4c", // みんな Charleston Road Registry Inc.
762        "xn--qcka1pmc", // グーグル Charleston Road Registry Inc.
763        "xn--rhqv96g", // 世界 Stable Tone Limited
764        "xn--ses554g", // 网址 HU YI GLOBAL INFORMATION RESOURCES (HOLDING) COMPANY. HONGKONG LIMITED
765        "xn--unup4y", // 游戏 Spring Fields, LLC
766        "xn--vermgensberater-ctb", // vermögensberater Deutsche Vermögensberatung Aktiengesellschaft DVAG
767        "xn--vermgensberatung-pwb", // vermögensberatung Deutsche Vermögensberatung Aktiengesellschaft DVAG
768        "xn--vhquv", // 企业 Dash McCook, LLC
769        "xn--xhq521b", // 广东 Guangzhou YU Wei Information Technology Co., Ltd.
770        "xn--zfr164b", // 政务 China Organizational Name Administration Center
771        "xxx",
772        "xyz",
773        "yachts",
774        "yandex",
775        "yoga",
776        "yokohama",
777        "youtube",
778        "zip",
779        "zone",
780        "zuerich",
781   };
782
    // WARNING: this array MUST be sorted, otherwise it cannot be searched reliably using binary search
784    private static final String[] COUNTRY_CODE_TLDS = new String[] {
785        "ac",                 // Ascension Island
786        "ad",                 // Andorra
787        "ae",                 // United Arab Emirates
788        "af",                 // Afghanistan
789        "ag",                 // Antigua and Barbuda
790        "ai",                 // Anguilla
791        "al",                 // Albania
792        "am",                 // Armenia
793        "an",                 // Netherlands Antilles
794        "ao",                 // Angola
795        "aq",                 // Antarctica
796        "ar",                 // Argentina
797        "as",                 // American Samoa
798        "at",                 // Austria
799        "au",                 // Australia (includes Ashmore and Cartier Islands and Coral Sea Islands)
800        "aw",                 // Aruba
801        "ax",                 // Åland
802        "az",                 // Azerbaijan
803        "ba",                 // Bosnia and Herzegovina
804        "bb",                 // Barbados
805        "bd",                 // Bangladesh
806        "be",                 // Belgium
807        "bf",                 // Burkina Faso
808        "bg",                 // Bulgaria
809        "bh",                 // Bahrain
810        "bi",                 // Burundi
811        "bj",                 // Benin
812        "bm",                 // Bermuda
813        "bn",                 // Brunei Darussalam
814        "bo",                 // Bolivia
815        "br",                 // Brazil
816        "bs",                 // Bahamas
817        "bt",                 // Bhutan
818        "bv",                 // Bouvet Island
819        "bw",                 // Botswana
820        "by",                 // Belarus
821        "bz",                 // Belize
822        "ca",                 // Canada
823        "cc",                 // Cocos (Keeling) Islands
824        "cd",                 // Democratic Republic of the Congo (formerly Zaire)
825        "cf",                 // Central African Republic
826        "cg",                 // Republic of the Congo
827        "ch",                 // Switzerland
828        "ci",                 // Côte d'Ivoire
829        "ck",                 // Cook Islands
830        "cl",                 // Chile
831        "cm",                 // Cameroon
832        "cn",                 // China, mainland
833        "co",                 // Colombia
834        "cr",                 // Costa Rica
835        "cu",                 // Cuba
836        "cv",                 // Cape Verde
837        "cw",                 // Curaçao
838        "cx",                 // Christmas Island
839        "cy",                 // Cyprus
840        "cz",                 // Czech Republic
841        "de",                 // Germany
842        "dj",                 // Djibouti
843        "dk",                 // Denmark
844        "dm",                 // Dominica
845        "do",                 // Dominican Republic
846        "dz",                 // Algeria
847        "ec",                 // Ecuador
848        "ee",                 // Estonia
849        "eg",                 // Egypt
850        "er",                 // Eritrea
851        "es",                 // Spain
852        "et",                 // Ethiopia
853        "eu",                 // European Union
854        "fi",                 // Finland
855        "fj",                 // Fiji
856        "fk",                 // Falkland Islands
857        "fm",                 // Federated States of Micronesia
858        "fo",                 // Faroe Islands
859        "fr",                 // France
860        "ga",                 // Gabon
861        "gb",                 // Great Britain (United Kingdom)
862        "gd",                 // Grenada
863        "ge",                 // Georgia
864        "gf",                 // French Guiana
865        "gg",                 // Guernsey
866        "gh",                 // Ghana
867        "gi",                 // Gibraltar
868        "gl",                 // Greenland
869        "gm",                 // The Gambia
870        "gn",                 // Guinea
871        "gp",                 // Guadeloupe
872        "gq",                 // Equatorial Guinea
873        "gr",                 // Greece
874        "gs",                 // South Georgia and the South Sandwich Islands
875        "gt",                 // Guatemala
876        "gu",                 // Guam
877        "gw",                 // Guinea-Bissau
878        "gy",                 // Guyana
879        "hk",                 // Hong Kong
880        "hm",                 // Heard Island and McDonald Islands
881        "hn",                 // Honduras
882        "hr",                 // Croatia (Hrvatska)
883        "ht",                 // Haiti
884        "hu",                 // Hungary
885        "id",                 // Indonesia
886        "ie",                 // Ireland (Éire)
887        "il",                 // Israel
888        "im",                 // Isle of Man
889        "in",                 // India
890        "io",                 // British Indian Ocean Territory
891        "iq",                 // Iraq
892        "ir",                 // Iran
893        "is",                 // Iceland
894        "it",                 // Italy
895        "je",                 // Jersey
896        "jm",                 // Jamaica
897        "jo",                 // Jordan
898        "jp",                 // Japan
899        "ke",                 // Kenya
900        "kg",                 // Kyrgyzstan
901        "kh",                 // Cambodia (Khmer)
902        "ki",                 // Kiribati
903        "km",                 // Comoros
904        "kn",                 // Saint Kitts and Nevis
905        "kp",                 // North Korea
906        "kr",                 // South Korea
907        "kw",                 // Kuwait
908        "ky",                 // Cayman Islands
909        "kz",                 // Kazakhstan
910        "la",                 // Laos (currently being marketed as the official domain for Los Angeles)
911        "lb",                 // Lebanon
912        "lc",                 // Saint Lucia
913        "li",                 // Liechtenstein
914        "lk",                 // Sri Lanka
915        "lr",                 // Liberia
916        "ls",                 // Lesotho
917        "lt",                 // Lithuania
918        "lu",                 // Luxembourg
919        "lv",                 // Latvia
920        "ly",                 // Libya
921        "ma",                 // Morocco
922        "mc",                 // Monaco
923        "md",                 // Moldova
924        "me",                 // Montenegro
925        "mg",                 // Madagascar
926        "mh",                 // Marshall Islands
927        "mk",                 // Republic of Macedonia
928        "ml",                 // Mali
929        "mm",                 // Myanmar
930        "mn",                 // Mongolia
931        "mo",                 // Macau
932        "mp",                 // Northern Mariana Islands
933        "mq",                 // Martinique
934        "mr",                 // Mauritania
935        "ms",                 // Montserrat
936        "mt",                 // Malta
937        "mu",                 // Mauritius
938        "mv",                 // Maldives
939        "mw",                 // Malawi
940        "mx",                 // Mexico
941        "my",                 // Malaysia
942        "mz",                 // Mozambique
943        "na",                 // Namibia
944        "nc",                 // New Caledonia
945        "ne",                 // Niger
946        "nf",                 // Norfolk Island
947        "ng",                 // Nigeria
948        "ni",                 // Nicaragua
949        "nl",                 // Netherlands
950        "no",                 // Norway
951        "np",                 // Nepal
952        "nr",                 // Nauru
953        "nu",                 // Niue
954        "nz",                 // New Zealand
955        "om",                 // Oman
956        "pa",                 // Panama
957        "pe",                 // Peru
958        "pf",                 // French Polynesia With Clipperton Island
959        "pg",                 // Papua New Guinea
960        "ph",                 // Philippines
961        "pk",                 // Pakistan
962        "pl",                 // Poland
963        "pm",                 // Saint-Pierre and Miquelon
964        "pn",                 // Pitcairn Islands
965        "pr",                 // Puerto Rico
966        "ps",                 // Palestinian territories (PA-controlled West Bank and Gaza Strip)
967        "pt",                 // Portugal
968        "pw",                 // Palau
969        "py",                 // Paraguay
970        "qa",                 // Qatar
971        "re",                 // Réunion
972        "ro",                 // Romania
973        "rs",                 // Serbia
974        "ru",                 // Russia
975        "rw",                 // Rwanda
976        "sa",                 // Saudi Arabia
977        "sb",                 // Solomon Islands
978        "sc",                 // Seychelles
979        "sd",                 // Sudan
980        "se",                 // Sweden
981        "sg",                 // Singapore
982        "sh",                 // Saint Helena
983        "si",                 // Slovenia
984        "sj",                 // Svalbard and Jan Mayen Islands Not in use (Norwegian dependencies; see .no)
985        "sk",                 // Slovakia
986        "sl",                 // Sierra Leone
987        "sm",                 // San Marino
988        "sn",                 // Senegal
989        "so",                 // Somalia
990        "sr",                 // Suriname
991        "st",                 // São Tomé and Príncipe
992        "su",                 // Soviet Union (deprecated)
993        "sv",                 // El Salvador
994        "sx",                 // Sint Maarten
995        "sy",                 // Syria
996        "sz",                 // Swaziland
997        "tc",                 // Turks and Caicos Islands
998        "td",                 // Chad
999        "tf",                 // French Southern and Antarctic Lands
1000        "tg",                 // Togo
1001        "th",                 // Thailand
1002        "tj",                 // Tajikistan
1003        "tk",                 // Tokelau
1004        "tl",                 // East Timor (deprecated old code)
1005        "tm",                 // Turkmenistan
1006        "tn",                 // Tunisia
1007        "to",                 // Tonga
1008        "tp",                 // East Timor
1009        "tr",                 // Turkey
1010        "tt",                 // Trinidad and Tobago
1011        "tv",                 // Tuvalu
1012        "tw",                 // Taiwan, Republic of China
1013        "tz",                 // Tanzania
1014        "ua",                 // Ukraine
1015        "ug",                 // Uganda
1016        "uk",                 // United Kingdom
1017        "us",                 // United States of America
1018        "uy",                 // Uruguay
1019        "uz",                 // Uzbekistan
1020        "va",                 // Vatican City State
1021        "vc",                 // Saint Vincent and the Grenadines
1022        "ve",                 // Venezuela
1023        "vg",                 // British Virgin Islands
1024        "vi",                 // U.S. Virgin Islands
1025        "vn",                 // Vietnam
1026        "vu",                 // Vanuatu
1027        "wf",                 // Wallis and Futuna
1028        "ws",                 // Samoa (formerly Western Samoa)
1029        "xn--3e0b707e", // 한국 KISA (Korea Internet &amp; Security Agency)
1030        "xn--45brj9c", // ভারত National Internet Exchange of India
1031        "xn--80ao21a", // қаз Association of IT Companies of Kazakhstan
1032        "xn--90a3ac", // срб Serbian National Internet Domain Registry (RNIDS)
1033        "xn--clchc0ea0b2g2a9gcd", // சிங்கப்பூர் Singapore Network Information Centre (SGNIC) Pte Ltd
1034        "xn--d1alf", // мкд Macedonian Academic Research Network Skopje
1035        "xn--fiqs8s", // 中国 China Internet Network Information Center
1036        "xn--fiqz9s", // 中國 China Internet Network Information Center
1037        "xn--fpcrj9c3d", // భారత్ National Internet Exchange of India
1038        "xn--fzc2c9e2c", // ලංකා LK Domain Registry
1039        "xn--gecrj9c", // ભારત National Internet Exchange of India
1040        "xn--h2brj9c", // भारत National Internet Exchange of India
1041        "xn--j1amh", // укр Ukrainian Network Information Centre (UANIC), Inc.
1042        "xn--j6w193g", // 香港 Hong Kong Internet Registration Corporation Ltd.
1043        "xn--kprw13d", // 台湾 Taiwan Network Information Center (TWNIC)
1044        "xn--kpry57d", // 台灣 Taiwan Network Information Center (TWNIC)
1045        "xn--l1acc", // мон Datacom Co.,Ltd
1046        "xn--lgbbat1ad8j", // الجزائر CERIST
1047        "xn--mgb9awbf", // عمان Telecommunications Regulatory Authority (TRA)
1048        "xn--mgba3a4f16a", // ایران Institute for Research in Fundamental Sciences (IPM)
1049        "xn--mgbaam7a8h", // امارات Telecommunications Regulatory Authority (TRA)
1050        "xn--mgbayh7gpa", // الاردن National Information Technology Center (NITC)
1051        "xn--mgbbh1a71e", // بھارت National Internet Exchange of India
1052        "xn--mgbc0a9azcg", // المغرب Agence Nationale de Réglementation des Télécommunications (ANRT)
1053        "xn--mgberp4a5d4ar", // السعودية Communications and Information Technology Commission
1054        "xn--mgbx4cd0ab", // مليسيا MYNIC Berhad
1055        "xn--node", // გე Information Technologies Development Center (ITDC)
1056        "xn--o3cw4h", // ไทย Thai Network Information Center Foundation
1057        "xn--ogbpf8fl", // سورية National Agency for Network Services (NANS)
1058        "xn--p1ai", // рф Coordination Center for TLD RU
1059        "xn--pgbs0dh", // تونس Agence Tunisienne d&#39;Internet
1060        "xn--s9brj9c", // ਭਾਰਤ National Internet Exchange of India
1061        "xn--wgbh1c", // مصر National Telecommunication Regulatory Authority - NTRA
1062        "xn--wgbl6a", // قطر Communications Regulatory Authority
1063        "xn--xkc2al3hye2a", // இலங்கை LK Domain Registry
1064        "xn--xkc2dl3a5ee0h", // இந்தியா National Internet Exchange of India
1065        "xn--yfro4i67o", // 新加坡 Singapore Network Information Centre (SGNIC) Pte Ltd
1066        "xn--ygbi2ammx", // فلسطين Ministry of Telecom &amp; Information Technology (MTIT)
1067        "ye",                 // Yemen
1068        "yt",                 // Mayotte
1069        "za",                 // South Africa
1070        "zm",                 // Zambia
1071        "zw",                 // Zimbabwe
1072    };
1073
1074    // WARNING: this array MUST be sorted, others it cannot be searched reliably using binary search
1075    private static final String[] LOCAL_TLDS = new String[] {
1076       "localdomain",         // Also widely used as localhost.localdomain
1077       "localhost",           // RFC2606 defined
1078    };
1079
1080    /**
1081     * Converts potentially Unicode input to punycode.
1082     * If conversion fails, returns the original input.
1083     * 
1084     * @param input the string to convert, not null
1085     * @return converted input, or original input if conversion fails
1086     */
1087    // Needed by UrlValidator
1088    static String unicodeToASCII(String input) {
1089        try {
1090            return /* java.net.IDN. */ toASCII(input);
1091        } catch (IllegalArgumentException e) { // input is not valid
1092            return input;
1093        }
1094    }
1095
1096    // ================= Code needed for Java 1.4 and 1.5 compatibility ===============
1097
1098    private static class IDNHolder {
1099        private static Method getMethod() {
1100            try {
1101                Class clazz = Class.forName("java.net.IDN", false, DomainValidator.class.getClassLoader());
1102                return clazz.getDeclaredMethod("toASCII", new Class[]{String.class});
1103            } catch (Exception e) {
1104              return null;
1105            }
1106        }
1107        private static final Method JAVA_NET_IDN_TO_ASCII = getMethod();
1108    }
1109
1110    /*
1111     * Helper method to invoke java.net.IDN.toAscii(String).
1112     * Allows code to be compiled with Java 1.4 and 1.5 
1113     * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
1114     */
1115    private static final String toASCII(String line) throws IllegalArgumentException {
1116//        java.net.IDN.toASCII(line); // Java 1.6+
1117        // implementation for Java 1.4 and 1.5
1118        // effectively this is done by IDN.toASCII but we want to skip the entire call
1119        if (isOnlyASCII(line)) {
1120            return line;
1121        }
1122        Method m = IDNHolder.JAVA_NET_IDN_TO_ASCII;
1123        if (m == null) { // avoid NPE
1124            return line;
1125        }
1126        try {
1127            return (String) m.invoke(null, new String[]{line.toLowerCase(Locale.ENGLISH)});
1128        } catch (IllegalAccessException e) {
1129            throw new RuntimeException(e); // Should not happen
1130        } catch (InvocationTargetException e) {
1131            Throwable t = e.getCause();
1132            if (t instanceof IllegalArgumentException) { // this is expected from toASCII method
1133                throw (IllegalArgumentException) t;
1134            }
1135            throw new RuntimeException(e); // Should not happen
1136        }
1137    }
1138
1139    /*
1140     * Check if input contains only ASCII
1141     * Treats null as all ASCII
1142     */
1143    private static boolean isOnlyASCII(String input) {
1144        if (input == null) {
1145            return true;
1146        }
1147        for(int i=0; i < input.length(); i++) {
1148            if (input.charAt(i) > 0x7F) {
1149                return false;
1150            }
1151        }
1152        return true;
1153    }
1154
1155}