001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.io.input; 018 019 import java.io.BufferedInputStream; 020 import java.io.BufferedReader; 021 import java.io.File; 022 import java.io.FileInputStream; 023 import java.io.IOException; 024 import java.io.InputStream; 025 import java.io.InputStreamReader; 026 import java.io.Reader; 027 import java.io.StringReader; 028 import java.net.HttpURLConnection; 029 import java.net.URL; 030 import java.net.URLConnection; 031 import java.text.MessageFormat; 032 import java.util.regex.Matcher; 033 import java.util.regex.Pattern; 034 035 import org.apache.commons.io.ByteOrderMark; 036 037 /** 038 * Character stream that handles all the necessary Voodo to figure out the 039 * charset encoding of the XML document within the stream. 040 * <p> 041 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. 042 * This one IS a character stream. 043 * <p> 044 * All this has to be done without consuming characters from the stream, if not 045 * the XML parser will not recognized the document as a valid XML. This is not 046 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers 047 * right now, XmlStreamReader handles it and things work in all parsers). 048 * <p> 049 * The XmlStreamReader class handles the charset encoding of XML documents in 050 * Files, raw streams and HTTP streams by offering a wide set of constructors. 051 * <p> 052 * By default the charset encoding detection is lenient, the constructor with 053 * the lenient flag can be used for an script (following HTTP MIME and XML 054 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a 055 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> 056 * Determining the character encoding of a feed</a>. 057 * <p> 058 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under 059 * Apache License 2.0. 060 * 061 * @author Alejandro Abdelnur 062 * @version $Id: XmlStreamReader.java 1021884 2010-10-12 18:49:16Z ggregory $ 063 * @see org.apache.commons.io.output.XmlStreamWriter 064 * @since Commons IO 2.0 065 */ 066 public class XmlStreamReader extends Reader { 067 private static final int BUFFER_SIZE = 4096; 068 069 private static final String UTF_8 = "UTF-8"; 070 071 private static final String US_ASCII = "US-ASCII"; 072 073 private static final String UTF_16BE = "UTF-16BE"; 074 075 private static final String UTF_16LE = "UTF-16LE"; 076 077 private static final String UTF_16 = "UTF-16"; 078 079 private static final String EBCDIC = "CP1047"; 080 081 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] { 082 ByteOrderMark.UTF_8, 083 ByteOrderMark.UTF_16BE, 084 ByteOrderMark.UTF_16LE 085 }; 086 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] { 087 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 088 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), 089 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 090 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) 091 }; 092 093 094 private final Reader reader; 095 096 private final String encoding; 097 098 private final String defaultEncoding; 099 100 /** 101 * Returns the default encoding to use if none is set in HTTP content-type, 102 * XML prolog and the rules based on content-type are not adequate. 103 * <p> 104 * If it is NULL the content-type based rules are used. 105 * 106 * @return the default encoding to use. 107 */ 108 public String getDefaultEncoding() { 109 return defaultEncoding; 110 } 111 112 /** 113 * Creates a Reader for a File. 114 * <p> 115 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, 116 * if this is also missing defaults to UTF-8. 117 * <p> 118 * It does a lenient charset encoding detection, check the constructor with 119 * the lenient parameter for details. 120 * 121 * @param file File to create a Reader from. 122 * @throws IOException thrown if there is a problem reading the file. 123 */ 124 public XmlStreamReader(File file) throws IOException { 125 this(new FileInputStream(file)); 126 } 127 128 /** 129 * Creates a Reader for a raw InputStream. 130 * <p> 131 * It follows the same logic used for files. 132 * <p> 133 * It does a lenient charset encoding detection, check the constructor with 134 * the lenient parameter for details. 135 * 136 * @param is InputStream to create a Reader from. 137 * @throws IOException thrown if there is a problem reading the stream. 138 */ 139 public XmlStreamReader(InputStream is) throws IOException { 140 this(is, true); 141 } 142 143 /** 144 * Creates a Reader for a raw InputStream. 145 * <p> 146 * It follows the same logic used for files. 147 * <p> 148 * If lenient detection is indicated and the detection above fails as per 149 * specifications it then attempts the following: 150 * <p> 151 * If the content type was 'text/html' it replaces it with 'text/xml' and 152 * tries the detection again. 153 * <p> 154 * Else if the XML prolog had a charset encoding that encoding is used. 155 * <p> 156 * Else if the content type had a charset encoding that encoding is used. 157 * <p> 158 * Else 'UTF-8' is used. 159 * <p> 160 * If lenient detection is indicated an XmlStreamReaderException is never 161 * thrown. 162 * 163 * @param is InputStream to create a Reader from. 164 * @param lenient indicates if the charset encoding detection should be 165 * relaxed. 166 * @throws IOException thrown if there is a problem reading the stream. 167 * @throws XmlStreamReaderException thrown if the charset encoding could not 168 * be determined according to the specs. 169 */ 170 public XmlStreamReader(InputStream is, boolean lenient) throws IOException { 171 this(is, lenient, null); 172 } 173 174 /** 175 * Creates a Reader for a raw InputStream. 176 * <p> 177 * It follows the same logic used for files. 178 * <p> 179 * If lenient detection is indicated and the detection above fails as per 180 * specifications it then attempts the following: 181 * <p> 182 * If the content type was 'text/html' it replaces it with 'text/xml' and 183 * tries the detection again. 184 * <p> 185 * Else if the XML prolog had a charset encoding that encoding is used. 186 * <p> 187 * Else if the content type had a charset encoding that encoding is used. 188 * <p> 189 * Else 'UTF-8' is used. 190 * <p> 191 * If lenient detection is indicated an XmlStreamReaderException is never 192 * thrown. 193 * 194 * @param is InputStream to create a Reader from. 195 * @param lenient indicates if the charset encoding detection should be 196 * relaxed. 197 * @param defaultEncoding The default encoding 198 * @throws IOException thrown if there is a problem reading the stream. 199 * @throws XmlStreamReaderException thrown if the charset encoding could not 200 * be determined according to the specs. 201 */ 202 public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException { 203 this.defaultEncoding = defaultEncoding; 204 this.encoding = doRawStream(is, lenient); 205 this.reader = new InputStreamReader(is, encoding); 206 } 207 208 /** 209 * Creates a Reader using the InputStream of a URL. 210 * <p> 211 * If the URL is not of type HTTP and there is not 'content-type' header in 212 * the fetched data it uses the same logic used for Files. 213 * <p> 214 * If the URL is a HTTP Url or there is a 'content-type' header in the 215 * fetched data it uses the same logic used for an InputStream with 216 * content-type. 217 * <p> 218 * It does a lenient charset encoding detection, check the constructor with 219 * the lenient parameter for details. 220 * 221 * @param url URL to create a Reader from. 222 * @throws IOException thrown if there is a problem reading the stream of 223 * the URL. 224 */ 225 public XmlStreamReader(URL url) throws IOException { 226 this(url.openConnection(), null); 227 } 228 229 /** 230 * Creates a Reader using the InputStream of a URLConnection. 231 * <p> 232 * If the URLConnection is not of type HttpURLConnection and there is not 233 * 'content-type' header in the fetched data it uses the same logic used for 234 * files. 235 * <p> 236 * If the URLConnection is a HTTP Url or there is a 'content-type' header in 237 * the fetched data it uses the same logic used for an InputStream with 238 * content-type. 239 * <p> 240 * It does a lenient charset encoding detection, check the constructor with 241 * the lenient parameter for details. 242 * 243 * @param conn URLConnection to create a Reader from. 244 * @param defaultEncoding The default encoding 245 * @throws IOException thrown if there is a problem reading the stream of 246 * the URLConnection. 247 */ 248 public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException { 249 this.defaultEncoding = defaultEncoding; 250 boolean lenient = true; 251 String contentType = conn.getContentType(); 252 InputStream is = conn.getInputStream(); 253 if (conn instanceof HttpURLConnection || contentType != null) { 254 this.encoding = doHttpStream(is, contentType, lenient); 255 } else { 256 this.encoding = doRawStream(is, lenient); 257 } 258 this.reader = new InputStreamReader(is, encoding); 259 } 260 261 /** 262 * Creates a Reader using an InputStream an the associated content-type 263 * header. 264 * <p> 265 * First it checks if the stream has BOM. If there is not BOM checks the 266 * content-type encoding. If there is not content-type encoding checks the 267 * XML prolog encoding. If there is not XML prolog encoding uses the default 268 * encoding mandated by the content-type MIME type. 269 * <p> 270 * It does a lenient charset encoding detection, check the constructor with 271 * the lenient parameter for details. 272 * 273 * @param is InputStream to create the reader from. 274 * @param httpContentType content-type header to use for the resolution of 275 * the charset encoding. 276 * @throws IOException thrown if there is a problem reading the file. 277 */ 278 public XmlStreamReader(InputStream is, String httpContentType) 279 throws IOException { 280 this(is, httpContentType, true); 281 } 282 283 /** 284 * Creates a Reader using an InputStream an the associated content-type 285 * header. This constructor is lenient regarding the encoding detection. 286 * <p> 287 * First it checks if the stream has BOM. If there is not BOM checks the 288 * content-type encoding. If there is not content-type encoding checks the 289 * XML prolog encoding. If there is not XML prolog encoding uses the default 290 * encoding mandated by the content-type MIME type. 291 * <p> 292 * If lenient detection is indicated and the detection above fails as per 293 * specifications it then attempts the following: 294 * <p> 295 * If the content type was 'text/html' it replaces it with 'text/xml' and 296 * tries the detection again. 297 * <p> 298 * Else if the XML prolog had a charset encoding that encoding is used. 299 * <p> 300 * Else if the content type had a charset encoding that encoding is used. 301 * <p> 302 * Else 'UTF-8' is used. 303 * <p> 304 * If lenient detection is indicated an XmlStreamReaderException is never 305 * thrown. 306 * 307 * @param is InputStream to create the reader from. 308 * @param httpContentType content-type header to use for the resolution of 309 * the charset encoding. 310 * @param lenient indicates if the charset encoding detection should be 311 * relaxed. 312 * @param defaultEncoding The default encoding 313 * @throws IOException thrown if there is a problem reading the file. 314 * @throws XmlStreamReaderException thrown if the charset encoding could not 315 * be determined according to the specs. 316 */ 317 public XmlStreamReader(InputStream is, String httpContentType, 318 boolean lenient, String defaultEncoding) throws IOException { 319 this.defaultEncoding = defaultEncoding; 320 this.encoding = doHttpStream(is, httpContentType, lenient); 321 this.reader = new InputStreamReader(is, encoding); 322 } 323 324 /** 325 * Creates a Reader using an InputStream an the associated content-type 326 * header. This constructor is lenient regarding the encoding detection. 327 * <p> 328 * First it checks if the stream has BOM. If there is not BOM checks the 329 * content-type encoding. If there is not content-type encoding checks the 330 * XML prolog encoding. If there is not XML prolog encoding uses the default 331 * encoding mandated by the content-type MIME type. 332 * <p> 333 * If lenient detection is indicated and the detection above fails as per 334 * specifications it then attempts the following: 335 * <p> 336 * If the content type was 'text/html' it replaces it with 'text/xml' and 337 * tries the detection again. 338 * <p> 339 * Else if the XML prolog had a charset encoding that encoding is used. 340 * <p> 341 * Else if the content type had a charset encoding that encoding is used. 342 * <p> 343 * Else 'UTF-8' is used. 344 * <p> 345 * If lenient detection is indicated an XmlStreamReaderException is never 346 * thrown. 347 * 348 * @param is InputStream to create the reader from. 349 * @param httpContentType content-type header to use for the resolution of 350 * the charset encoding. 351 * @param lenient indicates if the charset encoding detection should be 352 * relaxed. 353 * @throws IOException thrown if there is a problem reading the file. 354 * @throws XmlStreamReaderException thrown if the charset encoding could not 355 * be determined according to the specs. 356 */ 357 public XmlStreamReader(InputStream is, String httpContentType, 358 boolean lenient) throws IOException { 359 this(is, httpContentType, lenient, null); 360 } 361 362 /** 363 * Returns the charset encoding of the XmlStreamReader. 364 * 365 * @return charset encoding. 366 */ 367 public String getEncoding() { 368 return encoding; 369 } 370 371 /** 372 * Invokes the underlying reader's <code>read(char[], int, int)</code> method. 373 * @param buf the buffer to read the characters into 374 * @param offset The start offset 375 * @param len The number of bytes to read 376 * @return the number of characters read or -1 if the end of stream 377 * @throws IOException if an I/O error occurs 378 */ 379 @Override 380 public int read(char[] buf, int offset, int len) throws IOException { 381 return reader.read(buf, offset, len); 382 } 383 384 /** 385 * Closes the XmlStreamReader stream. 386 * 387 * @throws IOException thrown if there was a problem closing the stream. 388 */ 389 @Override 390 public void close() throws IOException { 391 reader.close(); 392 } 393 394 /** 395 * Process the raw stream. 396 * 397 * @param is InputStream to create the reader from. 398 * @param lenient indicates if the charset encoding detection should be 399 * relaxed. 400 * @return the encoding to be used 401 * @throws IOException thrown if there is a problem reading the stream. 402 */ 403 private String doRawStream(InputStream is, boolean lenient) 404 throws IOException { 405 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 406 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 407 String bomEnc = bom.getBOMCharsetName(); 408 String xmlGuessEnc = pis.getBOMCharsetName(); 409 String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 410 try { 411 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 412 } catch (XmlStreamReaderException ex) { 413 if (lenient) { 414 return doLenientDetection(null, is, ex); 415 } else { 416 throw ex; 417 } 418 } 419 } 420 421 /** 422 * Process a HTTP stream. 423 * 424 * @param is InputStream to create the reader from. 425 * @param httpContentType The HTTP content type 426 * @param lenient indicates if the charset encoding detection should be 427 * relaxed. 428 * @return the encoding to be used 429 * @throws IOException thrown if there is a problem reading the stream. 430 */ 431 private String doHttpStream(InputStream is, String httpContentType, 432 boolean lenient) throws IOException { 433 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 434 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 435 String bomEnc = bom.getBOMCharsetName(); 436 String xmlGuessEnc = pis.getBOMCharsetName(); 437 String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 438 try { 439 return calculateHttpEncoding(httpContentType, bomEnc, 440 xmlGuessEnc, xmlEnc, lenient); 441 } catch (XmlStreamReaderException ex) { 442 if (lenient) { 443 return doLenientDetection(httpContentType, is, ex); 444 } else { 445 throw ex; 446 } 447 } 448 } 449 450 /** 451 * Do lenient detection. 452 * 453 * @param httpContentType content-type header to use for the resolution of 454 * the charset encoding. 455 * @param is the unconsumed InputStream 456 * @param ex The thrown exception 457 * @return the encoding 458 * @throws IOException thrown if there is a problem reading the stream. 459 */ 460 private String doLenientDetection(String httpContentType, InputStream is, 461 XmlStreamReaderException ex) throws IOException { 462 if (httpContentType != null && httpContentType.startsWith("text/html")) { 463 httpContentType = httpContentType.substring("text/html".length()); 464 httpContentType = "text/xml" + httpContentType; 465 try { 466 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), 467 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 468 } catch (XmlStreamReaderException ex2) { 469 ex = ex2; 470 } 471 } 472 String encoding = ex.getXmlEncoding(); 473 if (encoding == null) { 474 encoding = ex.getContentTypeEncoding(); 475 } 476 if (encoding == null) { 477 encoding = (defaultEncoding == null) ? UTF_8 : defaultEncoding; 478 } 479 return encoding; 480 } 481 482 /** 483 * Calculate the raw encoding. 484 * 485 * @param bomEnc BOM encoding 486 * @param xmlGuessEnc XML Guess encoding 487 * @param xmlEnc XML encoding 488 * @return the raw encoding 489 * @throws IOException thrown if there is a problem reading the stream. 490 */ 491 String calculateRawEncoding(String bomEnc, String xmlGuessEnc, 492 String xmlEnc) throws IOException { 493 494 // BOM is Null 495 if (bomEnc == null) { 496 if (xmlGuessEnc == null || xmlEnc == null) { 497 return (defaultEncoding == null ? UTF_8 : defaultEncoding); 498 } 499 if (xmlEnc.equals(UTF_16) && 500 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 501 return xmlGuessEnc; 502 } 503 return xmlEnc; 504 } 505 506 // BOM is UTF-8 507 if (bomEnc.equals(UTF_8)) { 508 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 509 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 510 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 511 } 512 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 513 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 514 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 515 } 516 return bomEnc; 517 } 518 519 // BOM is UTF-16BE or UTF-16LE 520 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 521 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 522 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 523 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 524 } 525 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 526 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 527 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 528 } 529 return bomEnc; 530 } 531 532 // BOM is something else 533 String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 534 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 535 } 536 537 538 /** 539 * Calculate the HTTP encoding. 540 * 541 * @param httpContentType The HTTP content type 542 * @param bomEnc BOM encoding 543 * @param xmlGuessEnc XML Guess encoding 544 * @param xmlEnc XML encoding 545 * @param lenient indicates if the charset encoding detection should be 546 * relaxed. 547 * @return the HTTP encoding 548 * @throws IOException thrown if there is a problem reading the stream. 549 */ 550 String calculateHttpEncoding(String httpContentType, 551 String bomEnc, String xmlGuessEnc, String xmlEnc, 552 boolean lenient) throws IOException { 553 554 // Lenient and has XML encoding 555 if (lenient && xmlEnc != null) { 556 return xmlEnc; 557 } 558 559 // Determine mime/encoding content types from HTTP Content Type 560 String cTMime = getContentTypeMime(httpContentType); 561 String cTEnc = getContentTypeEncoding(httpContentType); 562 boolean appXml = isAppXml(cTMime); 563 boolean textXml = isTextXml(cTMime); 564 565 // Mime type NOT "application/xml" or "text/xml" 566 if (!appXml && !textXml) { 567 String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 568 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 569 } 570 571 // No content type encoding 572 if (cTEnc == null) { 573 if (appXml) { 574 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 575 } else { 576 return (defaultEncoding == null) ? US_ASCII : defaultEncoding; 577 } 578 } 579 580 // UTF-16BE or UTF-16LE content type encoding 581 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 582 if (bomEnc != null) { 583 String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 584 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 585 } 586 return cTEnc; 587 } 588 589 // UTF-16 content type encoding 590 if (cTEnc.equals(UTF_16)) { 591 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 592 return bomEnc; 593 } 594 String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 595 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 596 } 597 598 return cTEnc; 599 } 600 601 /** 602 * Returns MIME type or NULL if httpContentType is NULL. 603 * 604 * @param httpContentType the HTTP content type 605 * @return The mime content type 606 */ 607 static String getContentTypeMime(String httpContentType) { 608 String mime = null; 609 if (httpContentType != null) { 610 int i = httpContentType.indexOf(";"); 611 if (i >= 0) { 612 mime = httpContentType.substring(0, i); 613 } else { 614 mime = httpContentType; 615 } 616 mime = mime.trim(); 617 } 618 return mime; 619 } 620 621 private static final Pattern CHARSET_PATTERN = Pattern 622 .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 623 624 /** 625 * Returns charset parameter value, NULL if not present, NULL if 626 * httpContentType is NULL. 627 * 628 * @param httpContentType the HTTP content type 629 * @return The content type encoding 630 */ 631 static String getContentTypeEncoding(String httpContentType) { 632 String encoding = null; 633 if (httpContentType != null) { 634 int i = httpContentType.indexOf(";"); 635 if (i > -1) { 636 String postMime = httpContentType.substring(i + 1); 637 Matcher m = CHARSET_PATTERN.matcher(postMime); 638 encoding = (m.find()) ? m.group(1) : null; 639 encoding = (encoding != null) ? encoding.toUpperCase() : null; 640 } 641 } 642 return encoding; 643 } 644 645 public static final Pattern ENCODING_PATTERN = Pattern.compile( 646 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", 647 Pattern.MULTILINE); 648 649 /** 650 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. 651 * 652 * @param is InputStream to create the reader from. 653 * @param guessedEnc guessed encoding 654 * @return the encoding declared in the <?xml encoding=...?> 655 * @throws IOException thrown if there is a problem reading the stream. 656 */ 657 private static String getXmlProlog(InputStream is, String guessedEnc) 658 throws IOException { 659 String encoding = null; 660 if (guessedEnc != null) { 661 byte[] bytes = new byte[BUFFER_SIZE]; 662 is.mark(BUFFER_SIZE); 663 int offset = 0; 664 int max = BUFFER_SIZE; 665 int c = is.read(bytes, offset, max); 666 int firstGT = -1; 667 String xmlProlog = null; 668 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { 669 offset += c; 670 max -= c; 671 c = is.read(bytes, offset, max); 672 xmlProlog = new String(bytes, 0, offset, guessedEnc); 673 firstGT = xmlProlog.indexOf('>'); 674 } 675 if (firstGT == -1) { 676 if (c == -1) { 677 throw new IOException("Unexpected end of XML stream"); 678 } else { 679 throw new IOException( 680 "XML prolog or ROOT element not found on first " 681 + offset + " bytes"); 682 } 683 } 684 int bytesRead = offset; 685 if (bytesRead > 0) { 686 is.reset(); 687 BufferedReader bReader = new BufferedReader(new StringReader( 688 xmlProlog.substring(0, firstGT + 1))); 689 StringBuffer prolog = new StringBuffer(); 690 String line = bReader.readLine(); 691 while (line != null) { 692 prolog.append(line); 693 line = bReader.readLine(); 694 } 695 Matcher m = ENCODING_PATTERN.matcher(prolog); 696 if (m.find()) { 697 encoding = m.group(1).toUpperCase(); 698 encoding = encoding.substring(1, encoding.length() - 1); 699 } 700 } 701 } 702 return encoding; 703 } 704 705 /** 706 * Indicates if the MIME type belongs to the APPLICATION XML family. 707 * 708 * @param mime The mime type 709 * @return true if the mime type belongs to the APPLICATION XML family, 710 * otherwise false 711 */ 712 static boolean isAppXml(String mime) { 713 return mime != null && 714 (mime.equals("application/xml") || 715 mime.equals("application/xml-dtd") || 716 mime.equals("application/xml-external-parsed-entity") || 717 (mime.startsWith("application/") && mime.endsWith("+xml"))); 718 } 719 720 /** 721 * Indicates if the MIME type belongs to the TEXT XML family. 722 * 723 * @param mime The mime type 724 * @return true if the mime type belongs to the TEXT XML family, 725 * otherwise false 726 */ 727 static boolean isTextXml(String mime) { 728 return mime != null && 729 (mime.equals("text/xml") || 730 mime.equals("text/xml-external-parsed-entity") || 731 (mime.startsWith("text/") && mime.endsWith("+xml"))); 732 } 733 734 private static final String RAW_EX_1 = 735 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 736 737 private static final String RAW_EX_2 = 738 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 739 740 private static final String HTTP_EX_1 = 741 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; 742 743 private static final String HTTP_EX_2 = 744 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 745 746 private static final String HTTP_EX_3 = 747 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; 748 749 }