View Javadoc

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.commons.betwixt;
/**
 * <p><code>XMLUtils</code> contains basic utility methods for XML:
 * escaping of body text, attribute values and <code>CDATA</code> content,
 * and checking whether a string is a well formed XML name.</p>
 *
 * <p>The code for {@link #isWellFormedXMLName} is based on code in
 * <code>org.apache.xerces.util.XMLChar</code>
 * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
 * The authors of this class are credited below.</p>
 *
 * @author Glenn Marcy, IBM
 * @author Andy Clark, IBM
 * @author Eric Ye, IBM
 * @author Arnaud  Le Hors, IBM
 * @author Rahul Srivastava, Sun Microsystems Inc.
 *
 * @author Robert Burrell Donkin
 * @since 0.5
 */
public class XMLUtils {

    // Constants
    //-------------------------------------------------------------------------

    /** Escaped <code>&lt;</code> entity */
    public static final String LESS_THAN_ENTITY = "&lt;";
    /** Escaped <code>&gt;</code> entity */
    public static final String GREATER_THAN_ENTITY = "&gt;";
    /** Escaped <code>&amp;</code> entity */
    public static final String AMPERSAND_ENTITY = "&amp;";
    /** Escaped <code>'</code> entity */
    public static final String APOSTROPHE_ENTITY = "&apos;";
    /** Escaped <code>"</code> entity */
    public static final String QUOTE_ENTITY = "&quot;";

    /** The character sequence that terminates a <code>CDATA</code> section. */
    private static final String CDATA_END = "]]>";

    // Used by isWellFormedXMLName
    /** Name start character mask. */
    private static final int MASK_NAME_START = 0x01;
    /** Name character mask. */
    private static final int MASK_NAME = 0x02;

    // Class attributes
    //-------------------------------------------------------------------------

    /**
     * Character flags, one entry per 16 bit character value.
     * Each entry is a combination of {@link #MASK_NAME_START} and {@link #MASK_NAME}.
     */
    private static final byte[] CHARS = new byte[1 << 16];

    //
    // Static initialization
    //

    static {

        //
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
        //                  CombiningChar | Extender
        //

        final int[] nameChar = {
            0x002D, 0x002E, // '-' and '.'
        };

        //
        // [5] Name ::= (Letter | '_' | ':') (NameChar)*
        //

        final int[] nameStartChar = {
            0x003A, 0x005F, // ':' and '_'
        };

        //
        // [84] Letter ::= BaseChar | Ideographic
        //

        // Pairs of (first, last) code units; both ends are inclusive.
        final int[] letterRange = {
            // BaseChar
            0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
            0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
            0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
            0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
            0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
            0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
            0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
            0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
            0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
            0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
            0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
            0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
            0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
            0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
            0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
            0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
            0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
            0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
            0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
            0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
            0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
            0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
            0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
            0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
            0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
            0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
            0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
            0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
            0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
            0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
            0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
            0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
            0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
            0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
            0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
            0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
            0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
            0xAC00, 0xD7A3,
            // Ideographic
            0x3021, 0x3029, 0x4E00, 0x9FA5,
        };
        final int[] letterChar = {
            // BaseChar
            0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
            0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
            0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
            0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
            0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
            0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
            0x1F5D, 0x1FBE, 0x2126, 0x212E,
            // Ideographic
            0x3007,
        };

        //
        // [87] CombiningChar ::= ...
        //

        final int[] combiningCharRange = {
            0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
            0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
            0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
            0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
            0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
            0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
            0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
            0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
            0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
            0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
            0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
            0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
            0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
            0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
            0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
            0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
            0x20D0, 0x20DC, 0x302A, 0x302F,
        };

        final int[] combiningCharChar = {
            0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
            0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
            0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
            0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
        };

        //
        // [88] Digit ::= ...
        //

        final int[] digitRange = {
            0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
            0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
            0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
            0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
        };

        //
        // [89] Extender ::= ...
        //

        final int[] extenderRange = {
            0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
        };

        final int[] extenderChar = {
            0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        };

        //
        // Initialize
        //

        // Name start characters are also valid anywhere else in a name,
        // so they receive both flags.
        final int startAndName = MASK_NAME_START | MASK_NAME;
        flagChars(nameStartChar, startAndName);
        flagRanges(letterRange, startAndName);
        flagChars(letterChar, startAndName);

        // Characters that may only follow the first character of a name.
        flagChars(nameChar, MASK_NAME);
        flagRanges(digitRange, MASK_NAME);
        flagRanges(combiningCharRange, MASK_NAME);
        flagChars(combiningCharChar, MASK_NAME);
        flagRanges(extenderRange, MASK_NAME);
        flagChars(extenderChar, MASK_NAME);
    }

    /**
     * ORs <code>mask</code> into the {@link #CHARS} entry of every listed character.
     *
     * @param chars individual character values, each below <code>0x10000</code>
     * @param mask the flag bits to set
     */
    private static void flagChars(int[] chars, int mask) {
        for (int i = 0; i < chars.length; i++) {
            CHARS[chars[i]] |= mask;
        }
    }

    /**
     * ORs <code>mask</code> into the {@link #CHARS} entry of every character
     * within the given inclusive ranges.
     *
     * @param ranges consecutive (first, last) pairs; both ends are inclusive
     * @param mask the flag bits to set
     */
    private static void flagRanges(int[] ranges, int mask) {
        for (int i = 0; i < ranges.length; i += 2) {
            final int last = ranges[i + 1];
            for (int c = ranges[i]; c <= last; c++) {
                CHARS[c] |= mask;
            }
        }
    }

    // Constructor
    //-------------------------------------------------------------------------

    /**
     * <p>Constructor for use by tools that require <code>JavaBean</code> instances.</p>
     *
     * <p>This constructor is public <strong>only</strong>
     * to permit tools that require a JavaBean instance to operate.
     * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
     * programming. Instead, the class methods should be called directly.</p>
     */
    public XMLUtils() {}

    // Class methods
    //-------------------------------------------------------------------------

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use as body text.</p>
     *
     * <p>The characters <code>&lt;</code>, <code>&gt;</code> and <code>&amp;</code>
     * are replaced by their entities; all other characters are copied unchanged.</p>
     *
     * @param value escape <code>value.toString()</code>, not null
     * @return text with escaped delimiters
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeBodyValue(Object value) {
        return escape(value.toString(), false);
    }

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use in an attribute value.</p>
     *
     * <p>The characters <code>&lt;</code>, <code>&gt;</code>, <code>&amp;</code>,
     * <code>'</code> and <code>"</code> are replaced by their entities;
     * all other characters are copied unchanged.</p>
     *
     * @param value escape <code>value.toString()</code>, not null
     * @return text with characters restricted (for use in attributes) escaped
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeAttributeValue(Object value) {
        return escape(value.toString(), true);
    }

    /**
     * Copies <code>text</code> in a single pass, substituting entities for
     * markup delimiters.
     *
     * @param text the text to escape, not null
     * @param escapeQuotes whether <code>'</code> and <code>"</code> are escaped
     * in addition to <code>&lt;</code>, <code>&gt;</code> and <code>&amp;</code>
     * @return the escaped text
     */
    private static String escape(String text, boolean escapeQuotes) {
        final int length = text.length();
        // StringBuffer rather than StringBuilder: nothing else in this class
        // requires a Java 5 runtime.
        final StringBuffer escaped = new StringBuffer(length + 16);
        for (int i = 0; i < length; i++) {
            final char ch = text.charAt(i);
            switch (ch) {
                case '<':
                    escaped.append(LESS_THAN_ENTITY);
                    break;
                case '>':
                    escaped.append(GREATER_THAN_ENTITY);
                    break;
                case '&':
                    escaped.append(AMPERSAND_ENTITY);
                    break;
                case '\'':
                    if (escapeQuotes) {
                        escaped.append(APOSTROPHE_ENTITY);
                    } else {
                        escaped.append(ch);
                    }
                    break;
                case '"':
                    if (escapeQuotes) {
                        escaped.append(QUOTE_ENTITY);
                    } else {
                        escaped.append(ch);
                    }
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]>' is recognized as markup.
     * @param content the body content whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml, not null
     * @return escaped character data, not null
     * @throws NullPointerException if <code>content</code> is null
     */
    public static final String escapeCDATAContent(String content) {
        StringBuffer buffer = new StringBuffer(content);
        escapeCDATAContent(buffer);
        return buffer.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]>' is recognized as markup.
     * The buffer is modified in place: the <code>&gt;</code> that closes each
     * occurrence of ']]>' is replaced by {@link #GREATER_THAN_ENTITY}.
     * @param bufferedContent the body content within a buffer
     * whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml
     */
    public static final void escapeCDATAContent(StringBuffer bufferedContent) {
        int end = bufferedContent.indexOf(CDATA_END);
        while (end >= 0) {
            // Position of the '>' that closes this occurrence.
            final int gt = end + CDATA_END.length() - 1;
            bufferedContent.replace(gt, gt + 1, GREATER_THAN_ENTITY);
            // Resume after the inserted entity; it ends in ';' and so cannot
            // take part in a further match.
            end = bufferedContent.indexOf(CDATA_END, gt + GREATER_THAN_ENTITY.length());
        }
    }

    /**
     * <p>Is this string a well formed xml name?</p>
     *
     * <p>Only certain characters are allowed in well formed element and attribute
     * names in xml. For example, white space is not allowed in a name.</p>
     *
     * <p>The code for this method is based on code in
     * <code>org.apache.xerces.util.XMLChar</code>
     * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
     * The authors of this class are credited at the top of this class.</p>
     *
     * @param name the <code>String</code> to be checked for use as an xml attribute
     * or element name. Returns false if <code>name</code> is null or empty
     * @return true if this string would be a well-formed name
     */
    public static boolean isWellFormedXMLName( String name ) {
        if ( name == null || name.length() == 0 ) {
            return false;
        }

        if ( !isNameStartChar( name.charAt(0) ) ) {
            return false;
        }

        final int length = name.length();
        for (int i = 1; i < length; i++ ) {
            if ( !isNameChar( name.charAt(i) ) ) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the specified character is a valid name
     * character as defined by the XML 1.0 specification.
     * Values outside the range <code>0</code> to <code>0xFFFF</code>
     * (including negative values) are never name characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name character
     */
    public static boolean isNameChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME) != 0;
    }

    /**
     * Returns true if the specified character is a valid name start
     * character as defined in the XML 1.0 specification.
     * Values outside the range <code>0</code> to <code>0xFFFF</code>
     * (including negative values) are never name start characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name start character
     */
    public static boolean isNameStartChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME_START) != 0;
    }
}