View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.lang;
19  
20  import java.io.IOException;
21  import java.io.StringWriter;
22  import java.io.Writer;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.TreeMap;
26  
27  /**
28   * <p>
29   * Provides HTML and XML entity utilities.
30   * </p>
31   * 
32   * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
33   * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
34   * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
35   * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
36   * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
37   * 
38   * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
39   * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
40   * @since 2.0
41   * @version $Id: Entities.java 636641 2008-03-13 06:11:30Z bayard $
42   */
43  class Entities {
44  
45      private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
46          {"amp", "38"}, // & - ampersand
47          {"lt", "60"}, // < - less-than
48          {"gt", "62"}, // > - greater-than
49      };
50  
51      private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
52      };
53  
54      // package scoped for testing
55      static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
56          {"iexcl", "161"}, // inverted exclamation mark
57          {"cent", "162"}, // cent sign
58          {"pound", "163"}, // pound sign
59          {"curren", "164"}, // currency sign
60          {"yen", "165"}, // yen sign = yuan sign
61          {"brvbar", "166"}, // broken bar = broken vertical bar
62          {"sect", "167"}, // section sign
63          {"uml", "168"}, // diaeresis = spacing diaeresis
64          {"copy", "169"}, // © - copyright sign
65          {"ordf", "170"}, // feminine ordinal indicator
66          {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
67          {"not", "172"}, // not sign
68          {"shy", "173"}, // soft hyphen = discretionary hyphen
69          {"reg", "174"}, // ® - registered trademark sign
70          {"macr", "175"}, // macron = spacing macron = overline = APL overbar
71          {"deg", "176"}, // degree sign
72          {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
73          {"sup2", "178"}, // superscript two = superscript digit two = squared
74          {"sup3", "179"}, // superscript three = superscript digit three = cubed
75          {"acute", "180"}, // acute accent = spacing acute
76          {"micro", "181"}, // micro sign
77          {"para", "182"}, // pilcrow sign = paragraph sign
78          {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
79          {"cedil", "184"}, // cedilla = spacing cedilla
80          {"sup1", "185"}, // superscript one = superscript digit one
81          {"ordm", "186"}, // masculine ordinal indicator
82          {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
83          {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
84          {"frac12", "189"}, // vulgar fraction one half = fraction one half
85          {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
86          {"iquest", "191"}, // inverted question mark = turned question mark
87          {"Agrave", "192"}, // À - uppercase A, grave accent
88          {"Aacute", "193"}, // Á - uppercase A, acute accent
89          {"Acirc", "194"}, // Â - uppercase A, circumflex accent
90          {"Atilde", "195"}, // Ã - uppercase A, tilde
91          {"Auml", "196"}, // Ä - uppercase A, umlaut
92          {"Aring", "197"}, // Å - uppercase A, ring
93          {"AElig", "198"}, // Æ - uppercase AE
94          {"Ccedil", "199"}, // Ç - uppercase C, cedilla
95          {"Egrave", "200"}, // È - uppercase E, grave accent
96          {"Eacute", "201"}, // É - uppercase E, acute accent
97          {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
98          {"Euml", "203"}, // Ë - uppercase E, umlaut
99          {"Igrave", "204"}, // Ì - uppercase I, grave accent
100         {"Iacute", "205"}, // Í - uppercase I, acute accent
101         {"Icirc", "206"}, // Î - uppercase I, circumflex accent
102         {"Iuml", "207"}, // Ï - uppercase I, umlaut
103         {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
104         {"Ntilde", "209"}, // Ñ - uppercase N, tilde
105         {"Ograve", "210"}, // Ò - uppercase O, grave accent
106         {"Oacute", "211"}, // Ó - uppercase O, acute accent
107         {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
108         {"Otilde", "213"}, // Õ - uppercase O, tilde
109         {"Ouml", "214"}, // Ö - uppercase O, umlaut
110         {"times", "215"}, // multiplication sign
111         {"Oslash", "216"}, // Ø - uppercase O, slash
112         {"Ugrave", "217"}, // Ù - uppercase U, grave accent
113         {"Uacute", "218"}, // Ú - uppercase U, acute accent
114         {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
115         {"Uuml", "220"}, // Ü - uppercase U, umlaut
116         {"Yacute", "221"}, // Ý - uppercase Y, acute accent
117         {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
118         {"szlig", "223"}, // ß - lowercase sharps, German
119         {"agrave", "224"}, // à - lowercase a, grave accent
120         {"aacute", "225"}, // á - lowercase a, acute accent
121         {"acirc", "226"}, // â - lowercase a, circumflex accent
122         {"atilde", "227"}, // ã - lowercase a, tilde
123         {"auml", "228"}, // ä - lowercase a, umlaut
124         {"aring", "229"}, // å - lowercase a, ring
125         {"aelig", "230"}, // æ - lowercase ae
126         {"ccedil", "231"}, // ç - lowercase c, cedilla
127         {"egrave", "232"}, // è - lowercase e, grave accent
128         {"eacute", "233"}, // é - lowercase e, acute accent
129         {"ecirc", "234"}, // ê - lowercase e, circumflex accent
130         {"euml", "235"}, // ë - lowercase e, umlaut
131         {"igrave", "236"}, // ì - lowercase i, grave accent
132         {"iacute", "237"}, // í - lowercase i, acute accent
133         {"icirc", "238"}, // î - lowercase i, circumflex accent
134         {"iuml", "239"}, // ï - lowercase i, umlaut
135         {"eth", "240"}, // ð - lowercase eth, Icelandic
136         {"ntilde", "241"}, // ñ - lowercase n, tilde
137         {"ograve", "242"}, // ò - lowercase o, grave accent
138         {"oacute", "243"}, // ó - lowercase o, acute accent
139         {"ocirc", "244"}, // ô - lowercase o, circumflex accent
140         {"otilde", "245"}, // õ - lowercase o, tilde
141         {"ouml", "246"}, // ö - lowercase o, umlaut
142         {"divide", "247"}, // division sign
143         {"oslash", "248"}, // ø - lowercase o, slash
144         {"ugrave", "249"}, // ù - lowercase u, grave accent
145         {"uacute", "250"}, // ú - lowercase u, acute accent
146         {"ucirc", "251"}, // û - lowercase u, circumflex accent
147         {"uuml", "252"}, // ü - lowercase u, umlaut
148         {"yacute", "253"}, // ý - lowercase y, acute accent
149         {"thorn", "254"}, // þ - lowercase thorn, Icelandic
150         {"yuml", "255"}, // ÿ - lowercase y, umlaut
151     };
152 
153     // http://www.w3.org/TR/REC-html40/sgml/entities.html
154     // package scoped for testing
155     static final String[][] HTML40_ARRAY = {
156     // <!-- Latin Extended-B -->
157         {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
158         // <!-- Greek -->
159         {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
160         {"Beta", "914"}, // greek capital letter beta, U+0392 -->
161         {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
162         {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
163         {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
164         {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
165         {"Eta", "919"}, // greek capital letter eta, U+0397 -->
166         {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
167         {"Iota", "921"}, // greek capital letter iota, U+0399 -->
168         {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
169         {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
170         {"Mu", "924"}, // greek capital letter mu, U+039C -->
171         {"Nu", "925"}, // greek capital letter nu, U+039D -->
172         {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
173         {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
174         {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
175         {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
176         // <!-- there is no Sigmaf, and no U+03A2 character either -->
177         {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
178         {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
179         {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
180         {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
181         {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
182         {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
183         {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
184         {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
185         {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
186         {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
187         {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
188         {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
189         {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
190         {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
191         {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
192         {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
193         {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
194         {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
195         {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
196         {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
197         {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
198         {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
199         {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
200         {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
201         {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
202         {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
203         {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
204         {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
205         {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
206         {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
207         {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
208         {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
209         {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
210         {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
211         {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
212         // <!-- General Punctuation -->
213         {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
214         // <!-- bullet is NOT the same as bullet operator, U+2219 -->
215         {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
216         {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
217         {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
218         {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
219         {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
220         // <!-- Letterlike Symbols -->
221         {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
222         {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
223         {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
224         {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
225         {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
226         // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
227         // same glyph could be used to depict both characters -->
228         // <!-- Arrows -->
229         {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
230         {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
231         {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
232         {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
233         {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
234         {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
235         {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
236         // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
237         // arrow but also does not have any other character for that function.
238         // So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
239         {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
240         {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
241         // <!-- ISO 10646 does not say this is the 'implies' character but does not
242         // have another character with this function so ?rArr can be used for
243         // 'implies' as ISOtech suggests -->
244         {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
245         {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
246         // <!-- Mathematical Operators -->
247         {"forall", "8704"}, // for all, U+2200 ISOtech -->
248         {"part", "8706"}, // partial differential, U+2202 ISOtech -->
249         {"exist", "8707"}, // there exists, U+2203 ISOtech -->
250         {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
251         {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
252         {"isin", "8712"}, // element of, U+2208 ISOtech -->
253         {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
254         {"ni", "8715"}, // contains as member, U+220B ISOtech -->
255         // <!-- should there be a more memorable name than 'ni'? -->
256         {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
257         // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
258         // though the same glyph might be used for both -->
259         {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
260         // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
261         // though the same glyph might be used for both -->
262         {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
263         {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
264         {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
265         {"prop", "8733"}, // proportional to, U+221D ISOtech -->
266         {"infin", "8734"}, // infinity, U+221E ISOtech -->
267         {"ang", "8736"}, // angle, U+2220 ISOamso -->
268         {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
269         {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
270         {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
271         {"cup", "8746"}, // union = cup, U+222A ISOtech -->
272         {"int", "8747"}, // integral, U+222B ISOtech -->
273         {"there4", "8756"}, // therefore, U+2234 ISOtech -->
274         {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
275         // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
276         // the same glyph might be used to represent both -->
277         {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
278         {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
279         {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
280         {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
281         {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
282         {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
283         {"sub", "8834"}, // subset of, U+2282 ISOtech -->
284         {"sup", "8835"}, // superset of, U+2283 ISOtech -->
285         // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
286         // Symbol font encoding and is not included. Should it be, for symmetry?
287         // It is in ISOamsn --> <!ENTITY nsub", "8836"},
288         // not a subset of, U+2284 ISOamsn -->
289         {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
290         {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
291         {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
292         {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
293         {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
294         {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
295         // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
296         // <!-- Miscellaneous Technical -->
297         {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
298         {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
299         {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
300         {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
301         {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
302         // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
303         // mark' -->
304         {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
305         // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
306         // 'single right-pointing angle quotation mark' -->
307         // <!-- Geometric Shapes -->
308         {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
309         // <!-- Miscellaneous Symbols -->
310         {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
311         // <!-- black here seems to mean filled as opposed to hollow -->
312         {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
313         {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
314         {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
315 
316         // <!-- Latin Extended-A -->
317         {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
318         {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
319         // <!-- ligature is a misnomer, this is a separate character in some languages -->
320         {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
321         {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
322         {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
323         // <!-- Spacing Modifier Letters -->
324         {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
325         {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
326         // <!-- General Punctuation -->
327         {"ensp", "8194"}, // en space, U+2002 ISOpub -->
328         {"emsp", "8195"}, // em space, U+2003 ISOpub -->
329         {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
330         {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
331         {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
332         {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
333         {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
334         {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
335         {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
336         {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
337         {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
338         {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
339         {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
340         {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
341         {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
342         {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
343         {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
344         {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
345         {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
346         // <!-- lsaquo is proposed but not yet ISO standardized -->
347         {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
348         // <!-- rsaquo is proposed but not yet ISO standardized -->
349         {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
350     };
351 
352     /**
353      * <p>
354      * The set of entities supported by standard XML.
355      * </p>
356      */
357     public static final Entities XML;
358 
359     /**
360      * <p>
361      * The set of entities supported by HTML 3.2.
362      * </p>
363      */
364     public static final Entities HTML32;
365 
366     /**
367      * <p>
368      * The set of entities supported by HTML 4.0.
369      * </p>
370      */
371     public static final Entities HTML40;
372 
373     static {
374         XML = new Entities();
375         XML.addEntities(BASIC_ARRAY);
376         XML.addEntities(APOS_ARRAY);
377     }
378 
379     static {
380         HTML32 = new Entities();
381         HTML32.addEntities(BASIC_ARRAY);
382         HTML32.addEntities(ISO8859_1_ARRAY);
383     }
384 
385     static {
386         HTML40 = new Entities();
387         fillWithHtml40Entities(HTML40);
388     }
389 
390     /**
391      * <p>
392      * Fills the specified entities instance with HTML 40 entities.
393      * </p>
394      * 
395      * @param entities
396      *            the instance to be filled.
397      */
398     static void fillWithHtml40Entities(Entities entities) {
399         entities.addEntities(BASIC_ARRAY);
400         entities.addEntities(ISO8859_1_ARRAY);
401         entities.addEntities(HTML40_ARRAY);
402     }
403 
404     static interface EntityMap {
405         /**
406          * <p>
407          * Add an entry to this entity map.
408          * </p>
409          * 
410          * @param name
411          *            the entity name
412          * @param value
413          *            the entity value
414          */
415         void add(String name, int value);
416 
417         /**
418          * <p>
419          * Returns the name of the entity identified by the specified value.
420          * </p>
421          * 
422          * @param value
423          *            the value to locate
424          * @return entity name associated with the specified value
425          */
426         String name(int value);
427 
428         /**
429          * <p>
430          * Returns the value of the entity identified by the specified name.
431          * </p>
432          * 
433          * @param name
434          *            the name to locate
435          * @return entity value associated with the specified name
436          */
437         int value(String name);
438     }
439 
440     static class PrimitiveEntityMap implements EntityMap {
441         private Map mapNameToValue = new HashMap();
442 
443         private IntHashMap mapValueToName = new IntHashMap();
444 
445         /**
446          * {@inheritDoc}
447          */
448         public void add(String name, int value) {
449             mapNameToValue.put(name, new Integer(value));
450             mapValueToName.put(value, name);
451         }
452 
453         /**
454          * {@inheritDoc}
455          */
456         public String name(int value) {
457             return (String) mapValueToName.get(value);
458         }
459 
460         /**
461          * {@inheritDoc}
462          */
463         public int value(String name) {
464             Object value = mapNameToValue.get(name);
465             if (value == null) {
466                 return -1;
467             }
468             return ((Integer) value).intValue();
469         }
470     }
471 
472     static abstract class MapIntMap implements Entities.EntityMap {
473         protected Map mapNameToValue;
474 
475         protected Map mapValueToName;
476 
477         /**
478          * {@inheritDoc}
479          */
480         public void add(String name, int value) {
481             mapNameToValue.put(name, new Integer(value));
482             mapValueToName.put(new Integer(value), name);
483         }
484 
485         /**
486          * {@inheritDoc}
487          */
488         public String name(int value) {
489             return (String) mapValueToName.get(new Integer(value));
490         }
491 
492         /**
493          * {@inheritDoc}
494          */
495         public int value(String name) {
496             Object value = mapNameToValue.get(name);
497             if (value == null) {
498                 return -1;
499             }
500             return ((Integer) value).intValue();
501         }
502     }
503 
504     static class HashEntityMap extends MapIntMap {
505         /**
506          * Constructs a new instance of <code>HashEntityMap</code>.
507          */
508         public HashEntityMap() {
509             mapNameToValue = new HashMap();
510             mapValueToName = new HashMap();
511         }
512     }
513 
514     static class TreeEntityMap extends MapIntMap {
515         /**
516          * Constructs a new instance of <code>TreeEntityMap</code>.
517          */
518         public TreeEntityMap() {
519             mapNameToValue = new TreeMap();
520             mapValueToName = new TreeMap();
521         }
522     }
523 
524     static class LookupEntityMap extends PrimitiveEntityMap {
525         private String[] lookupTable;
526 
527         private int LOOKUP_TABLE_SIZE = 256;
528 
529         /**
530          * {@inheritDoc}
531          */
532         public String name(int value) {
533             if (value < LOOKUP_TABLE_SIZE) {
534                 return lookupTable()[value];
535             }
536             return super.name(value);
537         }
538 
539         /**
540          * <p>
541          * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
542          * </p>
543          * 
544          * @return the lookup table
545          */
546         private String[] lookupTable() {
547             if (lookupTable == null) {
548                 createLookupTable();
549             }
550             return lookupTable;
551         }
552 
553         /**
554          * <p>
555          * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
556          * </p>
557          */
558         private void createLookupTable() {
559             lookupTable = new String[LOOKUP_TABLE_SIZE];
560             for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
561                 lookupTable[i] = super.name(i);
562             }
563         }
564     }
565 
566     static class ArrayEntityMap implements EntityMap {
567         protected int growBy = 100;
568 
569         protected int size = 0;
570 
571         protected String[] names;
572 
573         protected int[] values;
574 
575         /**
576          * Constructs a new instance of <code>ArrayEntityMap</code>.
577          */
578         public ArrayEntityMap() {
579             names = new String[growBy];
580             values = new int[growBy];
581         }
582 
583         /**
584          * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
585          * grow.
586          * 
587          * @param growBy
588          *            array will be initialized to and will grow by this amount
589          */
590         public ArrayEntityMap(int growBy) {
591             this.growBy = growBy;
592             names = new String[growBy];
593             values = new int[growBy];
594         }
595 
596         /**
597          * {@inheritDoc}
598          */
599         public void add(String name, int value) {
600             ensureCapacity(size + 1);
601             names[size] = name;
602             values[size] = value;
603             size++;
604         }
605 
606         /**
607          * Verifies the capacity of the entity array, adjusting the size if necessary.
608          * 
609          * @param capacity
610          *            size the array should be
611          */
612         protected void ensureCapacity(int capacity) {
613             if (capacity > names.length) {
614                 int newSize = Math.max(capacity, size + growBy);
615                 String[] newNames = new String[newSize];
616                 System.arraycopy(names, 0, newNames, 0, size);
617                 names = newNames;
618                 int[] newValues = new int[newSize];
619                 System.arraycopy(values, 0, newValues, 0, size);
620                 values = newValues;
621             }
622         }
623 
624         /**
625          * {@inheritDoc}
626          */
627         public String name(int value) {
628             for (int i = 0; i < size; ++i) {
629                 if (values[i] == value) {
630                     return names[i];
631                 }
632             }
633             return null;
634         }
635 
636         /**
637          * {@inheritDoc}
638          */
639         public int value(String name) {
640             for (int i = 0; i < size; ++i) {
641                 if (names[i].equals(name)) {
642                     return values[i];
643                 }
644             }
645             return -1;
646         }
647     }
648 
649     static class BinaryEntityMap extends ArrayEntityMap {
650 
651         /**
652          * Constructs a new instance of <code>BinaryEntityMap</code>.
653          */
654         public BinaryEntityMap() {
655             super();
656         }
657 
658         /**
659          * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
660          * should grow.
661          * 
662          * @param growBy
663          *            array will be initialized to and will grow by this amount
664          */
665         public BinaryEntityMap(int growBy) {
666             super(growBy);
667         }
668 
669         /**
670          * Performs a binary search of the entity array for the specified key. This method is based on code in
671          * {@link java.util.Arrays}.
672          * 
673          * @param key
674          *            the key to be found
675          * @return the index of the entity array matching the specified key
676          */
677         private int binarySearch(int key) {
678             int low = 0;
679             int high = size - 1;
680 
681             while (low <= high) {
682                 int mid = (low + high) >>> 1;
683                 int midVal = values[mid];
684 
685                 if (midVal < key) {
686                     low = mid + 1;
687                 } else if (midVal > key) {
688                     high = mid - 1;
689                 } else {
690                     return mid; // key found
691                 }
692             }
693             return -(low + 1); // key not found.
694         }
695 
696         /**
697          * {@inheritDoc}
698          */
699         public void add(String name, int value) {
700             ensureCapacity(size + 1);
701             int insertAt = binarySearch(value);
702             if (insertAt > 0) {
703                 return; // note: this means you can't insert the same value twice
704             }
705             insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
706             System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
707             values[insertAt] = value;
708             System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
709             names[insertAt] = name;
710             size++;
711         }
712 
713         /**
714          * {@inheritDoc}
715          */
716         public String name(int value) {
717             int index = binarySearch(value);
718             if (index < 0) {
719                 return null;
720             }
721             return names[index];
722         }
723     }
724 
725     // package scoped for testing
726     EntityMap map = new Entities.LookupEntityMap();
727 
728     /**
729      * <p>
730      * Adds entities to this entity.
731      * </p>
732      * 
733      * @param entityArray
734      *            array of entities to be added
735      */
736     public void addEntities(String[][] entityArray) {
737         for (int i = 0; i < entityArray.length; ++i) {
738             addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
739         }
740     }
741 
742     /**
743      * <p>
744      * Add an entity to this entity.
745      * </p>
746      * 
747      * @param name
748      *            name of the entity
749      * @param value
750      *            value of the entity
751      */
752     public void addEntity(String name, int value) {
753         map.add(name, value);
754     }
755 
756     /**
757      * <p>
758      * Returns the name of the entity identified by the specified value.
759      * </p>
760      * 
761      * @param value
762      *            the value to locate
763      * @return entity name associated with the specified value
764      */
765     public String entityName(int value) {
766         return map.name(value);
767     }
768 
769     /**
770      * <p>
771      * Returns the value of the entity identified by the specified name.
772      * </p>
773      * 
774      * @param name
775      *            the name to locate
776      * @return entity value associated with the specified name
777      */
778     public int entityValue(String name) {
779         return map.value(name);
780     }
781 
782     /**
783      * <p>
784      * Escapes the characters in a <code>String</code>.
785      * </p>
786      * 
787      * <p>
788      * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), escape(&quot;\u00A1&quot;) will return
789      * &quot;&amp;foo;&quot;
790      * </p>
791      * 
792      * @param str
793      *            The <code>String</code> to escape.
794      * @return A new escaped <code>String</code>.
795      */
796     public String escape(String str) {
797         StringWriter stringWriter = createStringWriter(str);
798         try {
799             this.escape(stringWriter, str);
800         } catch (IOException e) {
801             // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
802             // throw IOExceptions.
803             throw new UnhandledException(e);
804         }
805         return stringWriter.toString();
806     }
807 
808     /**
809      * <p>
810      * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
811      * passed.
812      * </p>
813      * 
814      * @param writer
815      *            The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
816      * @param str
817      *            The <code>String</code> to escape. Assumed to be a non-null value.
818      * @throws IOException
819      *             when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
820      *             methods.
821      * 
822      * @see #escape(String)
823      * @see Writer
824      */
825     public void escape(Writer writer, String str) throws IOException {
826         int len = str.length();
827         for (int i = 0; i < len; i++) {
828             char c = str.charAt(i);
829             String entityName = this.entityName(c);
830             if (entityName == null) {
831                 if (c > 0x7F) {
832                     writer.write("&#");
833                     writer.write(Integer.toString(c, 10));
834                     writer.write(';');
835                 } else {
836                     writer.write(c);
837                 }
838             } else {
839                 writer.write('&');
840                 writer.write(entityName);
841                 writer.write(';');
842             }
843         }
844     }
845 
846     /**
847      * <p>
848      * Unescapes the entities in a <code>String</code>.
849      * </p>
850      * 
851      * <p>
852      * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), unescape(&quot;&amp;foo;&quot;) will return
853      * &quot;\u00A1&quot;
854      * </p>
855      * 
856      * @param str
857      *            The <code>String</code> to unescape.
858      * @return The unescaped <code>String</code>.
859      */
860     public String unescape(String str) {
861         int firstAmp = str.indexOf('&');
862         if (firstAmp < 0) {
863             return str;
864         } else {
865             StringWriter stringWriter = createStringWriter(str);
866             try {
867                 this.doUnescape(stringWriter, str, firstAmp);
868             } catch (IOException e) {
869                 // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) 
870                 // do not throw IOExceptions.
871                 throw new UnhandledException(e);
872             }
873             return stringWriter.toString();
874         }
875     }
876 
877     /**
878      * Make the StringWriter 10% larger than the source String to avoid growing the writer
879      *
880      * @param str The source string
881      * @return A newly created StringWriter
882      */
883     private StringWriter createStringWriter(String str) {
884         return new StringWriter((int) (str.length() + (str.length() * 0.1)));
885     }
886 
887     /**
888      * <p>
889      * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
890      * <code>Writer</code> passed.
891      * </p>
892      * 
893      * @param writer
894      *            The <code>Writer</code> to write the results to; assumed to be non-null.
895      * @param str
896      *            The source <code>String</code> to unescape; assumed to be non-null.
897      * @throws IOException
898      *             when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
899      *             methods.
900      * 
901      * @see #unescape(String)
902      * @see Writer
903      */
904     public void unescape(Writer writer, String str) throws IOException {
905         int firstAmp = str.indexOf('&');
906         if (firstAmp < 0) {
907             writer.write(str);
908             return;
909         } else {
910             doUnescape(writer, str, firstAmp);
911         }
912     }
913 
914     /**
915      * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
916      *
917      * @param writer
918      *            The <code>Writer</code> to write the results to; assumed to be non-null.
919      * @param str
920      *            The source <code>String</code> to unescape; assumed to be non-null.
921      * @param firstAmp
922      *            The <code>int</code> index of the first ampersand in the source String.
923      * @throws IOException
924      *             when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
925      *             methods.
926      */
927     private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
928         writer.write(str, 0, firstAmp);
929         int len = str.length();
930         for (int i = firstAmp; i < len; i++) {
931             char c = str.charAt(i);
932             if (c == '&') {
933                 int nextIdx = i + 1;
934                 int semiColonIdx = str.indexOf(';', nextIdx);
935                 if (semiColonIdx == -1) {
936                     writer.write(c);
937                     continue;
938                 }
939                 int amphersandIdx = str.indexOf('&', i + 1);
940                 if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
941                     // Then the text looks like &...&...;
942                     writer.write(c);
943                     continue;
944                 }
945                 String entityContent = str.substring(nextIdx, semiColonIdx);
946                 int entityValue = -1;
947                 int entityContentLen = entityContent.length();
948                 if (entityContentLen > 0) {
949                     if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
950                         // hexidecimal)
951                         if (entityContentLen > 1) {
952                             char isHexChar = entityContent.charAt(1);
953                             try {
954                                 switch (isHexChar) {
955                                     case 'X' :
956                                     case 'x' : {
957                                         entityValue = Integer.parseInt(entityContent.substring(2), 16);
958                                         break;
959                                     }
960                                     default : {
961                                         entityValue = Integer.parseInt(entityContent.substring(1), 10);
962                                     }
963                                 }
964                                 if (entityValue > 0xFFFF) {
965                                     entityValue = -1;
966                                 }
967                             } catch (NumberFormatException e) {
968                                 entityValue = -1;
969                             }
970                         }
971                     } else { // escaped value content is an entity name
972                         entityValue = this.entityValue(entityContent);
973                     }
974                 }
975 
976                 if (entityValue == -1) {
977                     writer.write('&');
978                     writer.write(entityContent);
979                     writer.write(';');
980                 } else {
981                     writer.write(entityValue);
982                 }
983                 i = semiColonIdx; // move index up to the semi-colon
984             } else {
985                 writer.write(c);
986             }
987         }
988     }
989 
990 }