001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.io.input; 018 019 import java.io.BufferedInputStream; 020 import java.io.BufferedReader; 021 import java.io.File; 022 import java.io.FileInputStream; 023 import java.io.IOException; 024 import java.io.InputStream; 025 import java.io.InputStreamReader; 026 import java.io.Reader; 027 import java.io.StringReader; 028 import java.net.HttpURLConnection; 029 import java.net.URL; 030 import java.net.URLConnection; 031 import java.text.MessageFormat; 032 import java.util.Locale; 033 import java.util.regex.Matcher; 034 import java.util.regex.Pattern; 035 036 import org.apache.commons.io.ByteOrderMark; 037 038 /** 039 * Character stream that handles all the necessary Voodo to figure out the 040 * charset encoding of the XML document within the stream. 041 * <p> 042 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. 043 * This one IS a character stream. 044 * <p> 045 * All this has to be done without consuming characters from the stream, if not 046 * the XML parser will not recognized the document as a valid XML. This is not 047 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers 048 * right now, XmlStreamReader handles it and things work in all parsers). 049 * <p> 050 * The XmlStreamReader class handles the charset encoding of XML documents in 051 * Files, raw streams and HTTP streams by offering a wide set of constructors. 052 * <p> 053 * By default the charset encoding detection is lenient, the constructor with 054 * the lenient flag can be used for an script (following HTTP MIME and XML 055 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a 056 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> 057 * Determining the character encoding of a feed</a>. 058 * <p> 059 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under 060 * Apache License 2.0. 061 * 062 * @version $Id: XmlStreamReader.java 1304052 2012-03-22 20:55:29Z ggregory $ 063 * @see org.apache.commons.io.output.XmlStreamWriter 064 * @since 2.0 065 */ 066 public class XmlStreamReader extends Reader { 067 private static final int BUFFER_SIZE = 4096; 068 069 private static final String UTF_8 = "UTF-8"; 070 071 private static final String US_ASCII = "US-ASCII"; 072 073 private static final String UTF_16BE = "UTF-16BE"; 074 075 private static final String UTF_16LE = "UTF-16LE"; 076 077 private static final String UTF_16 = "UTF-16"; 078 079 private static final String EBCDIC = "CP1047"; 080 081 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] { 082 ByteOrderMark.UTF_8, 083 ByteOrderMark.UTF_16BE, 084 ByteOrderMark.UTF_16LE 085 }; 086 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] { 087 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 088 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), 089 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 090 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) 091 }; 092 093 094 private final Reader reader; 095 096 private final String encoding; 097 098 private final String defaultEncoding; 099 100 /** 101 * Returns the default encoding to use if none is set in HTTP content-type, 102 * XML prolog and the rules based on content-type are not adequate. 103 * <p> 104 * If it is NULL the content-type based rules are used. 105 * 106 * @return the default encoding to use. 107 */ 108 public String getDefaultEncoding() { 109 return defaultEncoding; 110 } 111 112 /** 113 * Creates a Reader for a File. 114 * <p> 115 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, 116 * if this is also missing defaults to UTF-8. 117 * <p> 118 * It does a lenient charset encoding detection, check the constructor with 119 * the lenient parameter for details. 120 * 121 * @param file File to create a Reader from. 122 * @throws IOException thrown if there is a problem reading the file. 123 */ 124 public XmlStreamReader(File file) throws IOException { 125 this(new FileInputStream(file)); 126 } 127 128 /** 129 * Creates a Reader for a raw InputStream. 130 * <p> 131 * It follows the same logic used for files. 132 * <p> 133 * It does a lenient charset encoding detection, check the constructor with 134 * the lenient parameter for details. 135 * 136 * @param is InputStream to create a Reader from. 137 * @throws IOException thrown if there is a problem reading the stream. 138 */ 139 public XmlStreamReader(InputStream is) throws IOException { 140 this(is, true); 141 } 142 143 /** 144 * Creates a Reader for a raw InputStream. 145 * <p> 146 * It follows the same logic used for files. 147 * <p> 148 * If lenient detection is indicated and the detection above fails as per 149 * specifications it then attempts the following: 150 * <p> 151 * If the content type was 'text/html' it replaces it with 'text/xml' and 152 * tries the detection again. 153 * <p> 154 * Else if the XML prolog had a charset encoding that encoding is used. 155 * <p> 156 * Else if the content type had a charset encoding that encoding is used. 157 * <p> 158 * Else 'UTF-8' is used. 159 * <p> 160 * If lenient detection is indicated an XmlStreamReaderException is never 161 * thrown. 162 * 163 * @param is InputStream to create a Reader from. 164 * @param lenient indicates if the charset encoding detection should be 165 * relaxed. 166 * @throws IOException thrown if there is a problem reading the stream. 167 * @throws XmlStreamReaderException thrown if the charset encoding could not 168 * be determined according to the specs. 169 */ 170 public XmlStreamReader(InputStream is, boolean lenient) throws IOException { 171 this(is, lenient, null); 172 } 173 174 /** 175 * Creates a Reader for a raw InputStream. 176 * <p> 177 * It follows the same logic used for files. 178 * <p> 179 * If lenient detection is indicated and the detection above fails as per 180 * specifications it then attempts the following: 181 * <p> 182 * If the content type was 'text/html' it replaces it with 'text/xml' and 183 * tries the detection again. 184 * <p> 185 * Else if the XML prolog had a charset encoding that encoding is used. 186 * <p> 187 * Else if the content type had a charset encoding that encoding is used. 188 * <p> 189 * Else 'UTF-8' is used. 190 * <p> 191 * If lenient detection is indicated an XmlStreamReaderException is never 192 * thrown. 193 * 194 * @param is InputStream to create a Reader from. 195 * @param lenient indicates if the charset encoding detection should be 196 * relaxed. 197 * @param defaultEncoding The default encoding 198 * @throws IOException thrown if there is a problem reading the stream. 199 * @throws XmlStreamReaderException thrown if the charset encoding could not 200 * be determined according to the specs. 201 */ 202 public XmlStreamReader(InputStream is, boolean lenient, String defaultEncoding) throws IOException { 203 this.defaultEncoding = defaultEncoding; 204 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 205 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 206 this.encoding = doRawStream(bom, pis, lenient); 207 this.reader = new InputStreamReader(pis, encoding); 208 } 209 210 /** 211 * Creates a Reader using the InputStream of a URL. 212 * <p> 213 * If the URL is not of type HTTP and there is not 'content-type' header in 214 * the fetched data it uses the same logic used for Files. 215 * <p> 216 * If the URL is a HTTP Url or there is a 'content-type' header in the 217 * fetched data it uses the same logic used for an InputStream with 218 * content-type. 219 * <p> 220 * It does a lenient charset encoding detection, check the constructor with 221 * the lenient parameter for details. 222 * 223 * @param url URL to create a Reader from. 224 * @throws IOException thrown if there is a problem reading the stream of 225 * the URL. 226 */ 227 public XmlStreamReader(URL url) throws IOException { 228 this(url.openConnection(), null); 229 } 230 231 /** 232 * Creates a Reader using the InputStream of a URLConnection. 233 * <p> 234 * If the URLConnection is not of type HttpURLConnection and there is not 235 * 'content-type' header in the fetched data it uses the same logic used for 236 * files. 237 * <p> 238 * If the URLConnection is a HTTP Url or there is a 'content-type' header in 239 * the fetched data it uses the same logic used for an InputStream with 240 * content-type. 241 * <p> 242 * It does a lenient charset encoding detection, check the constructor with 243 * the lenient parameter for details. 244 * 245 * @param conn URLConnection to create a Reader from. 246 * @param defaultEncoding The default encoding 247 * @throws IOException thrown if there is a problem reading the stream of 248 * the URLConnection. 249 */ 250 public XmlStreamReader(URLConnection conn, String defaultEncoding) throws IOException { 251 this.defaultEncoding = defaultEncoding; 252 boolean lenient = true; 253 String contentType = conn.getContentType(); 254 InputStream is = conn.getInputStream(); 255 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 256 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 257 if (conn instanceof HttpURLConnection || contentType != null) { 258 this.encoding = doHttpStream(bom, pis, contentType, lenient); 259 } else { 260 this.encoding = doRawStream(bom, pis, lenient); 261 } 262 this.reader = new InputStreamReader(pis, encoding); 263 } 264 265 /** 266 * Creates a Reader using an InputStream an the associated content-type 267 * header. 268 * <p> 269 * First it checks if the stream has BOM. If there is not BOM checks the 270 * content-type encoding. If there is not content-type encoding checks the 271 * XML prolog encoding. If there is not XML prolog encoding uses the default 272 * encoding mandated by the content-type MIME type. 273 * <p> 274 * It does a lenient charset encoding detection, check the constructor with 275 * the lenient parameter for details. 276 * 277 * @param is InputStream to create the reader from. 278 * @param httpContentType content-type header to use for the resolution of 279 * the charset encoding. 280 * @throws IOException thrown if there is a problem reading the file. 281 */ 282 public XmlStreamReader(InputStream is, String httpContentType) 283 throws IOException { 284 this(is, httpContentType, true); 285 } 286 287 /** 288 * Creates a Reader using an InputStream an the associated content-type 289 * header. This constructor is lenient regarding the encoding detection. 290 * <p> 291 * First it checks if the stream has BOM. If there is not BOM checks the 292 * content-type encoding. If there is not content-type encoding checks the 293 * XML prolog encoding. If there is not XML prolog encoding uses the default 294 * encoding mandated by the content-type MIME type. 295 * <p> 296 * If lenient detection is indicated and the detection above fails as per 297 * specifications it then attempts the following: 298 * <p> 299 * If the content type was 'text/html' it replaces it with 'text/xml' and 300 * tries the detection again. 301 * <p> 302 * Else if the XML prolog had a charset encoding that encoding is used. 303 * <p> 304 * Else if the content type had a charset encoding that encoding is used. 305 * <p> 306 * Else 'UTF-8' is used. 307 * <p> 308 * If lenient detection is indicated an XmlStreamReaderException is never 309 * thrown. 310 * 311 * @param is InputStream to create the reader from. 312 * @param httpContentType content-type header to use for the resolution of 313 * the charset encoding. 314 * @param lenient indicates if the charset encoding detection should be 315 * relaxed. 316 * @param defaultEncoding The default encoding 317 * @throws IOException thrown if there is a problem reading the file. 318 * @throws XmlStreamReaderException thrown if the charset encoding could not 319 * be determined according to the specs. 320 */ 321 public XmlStreamReader(InputStream is, String httpContentType, 322 boolean lenient, String defaultEncoding) throws IOException { 323 this.defaultEncoding = defaultEncoding; 324 BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 325 BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 326 this.encoding = doHttpStream(bom, pis, httpContentType, lenient); 327 this.reader = new InputStreamReader(pis, encoding); 328 } 329 330 /** 331 * Creates a Reader using an InputStream an the associated content-type 332 * header. This constructor is lenient regarding the encoding detection. 333 * <p> 334 * First it checks if the stream has BOM. If there is not BOM checks the 335 * content-type encoding. If there is not content-type encoding checks the 336 * XML prolog encoding. If there is not XML prolog encoding uses the default 337 * encoding mandated by the content-type MIME type. 338 * <p> 339 * If lenient detection is indicated and the detection above fails as per 340 * specifications it then attempts the following: 341 * <p> 342 * If the content type was 'text/html' it replaces it with 'text/xml' and 343 * tries the detection again. 344 * <p> 345 * Else if the XML prolog had a charset encoding that encoding is used. 346 * <p> 347 * Else if the content type had a charset encoding that encoding is used. 348 * <p> 349 * Else 'UTF-8' is used. 350 * <p> 351 * If lenient detection is indicated an XmlStreamReaderException is never 352 * thrown. 353 * 354 * @param is InputStream to create the reader from. 355 * @param httpContentType content-type header to use for the resolution of 356 * the charset encoding. 357 * @param lenient indicates if the charset encoding detection should be 358 * relaxed. 359 * @throws IOException thrown if there is a problem reading the file. 360 * @throws XmlStreamReaderException thrown if the charset encoding could not 361 * be determined according to the specs. 362 */ 363 public XmlStreamReader(InputStream is, String httpContentType, 364 boolean lenient) throws IOException { 365 this(is, httpContentType, lenient, null); 366 } 367 368 /** 369 * Returns the charset encoding of the XmlStreamReader. 370 * 371 * @return charset encoding. 372 */ 373 public String getEncoding() { 374 return encoding; 375 } 376 377 /** 378 * Invokes the underlying reader's <code>read(char[], int, int)</code> method. 379 * @param buf the buffer to read the characters into 380 * @param offset The start offset 381 * @param len The number of bytes to read 382 * @return the number of characters read or -1 if the end of stream 383 * @throws IOException if an I/O error occurs 384 */ 385 @Override 386 public int read(char[] buf, int offset, int len) throws IOException { 387 return reader.read(buf, offset, len); 388 } 389 390 /** 391 * Closes the XmlStreamReader stream. 392 * 393 * @throws IOException thrown if there was a problem closing the stream. 394 */ 395 @Override 396 public void close() throws IOException { 397 reader.close(); 398 } 399 400 /** 401 * Process the raw stream. 402 * 403 * @param bom BOMInputStream to detect byte order marks 404 * @param pis BOMInputStream to guess XML encoding 405 * @param lenient indicates if the charset encoding detection should be 406 * relaxed. 407 * @return the encoding to be used 408 * @throws IOException thrown if there is a problem reading the stream. 409 */ 410 private String doRawStream(BOMInputStream bom, BOMInputStream pis, boolean lenient) 411 throws IOException { 412 String bomEnc = bom.getBOMCharsetName(); 413 String xmlGuessEnc = pis.getBOMCharsetName(); 414 String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 415 try { 416 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 417 } catch (XmlStreamReaderException ex) { 418 if (lenient) { 419 return doLenientDetection(null, ex); 420 } else { 421 throw ex; 422 } 423 } 424 } 425 426 /** 427 * Process a HTTP stream. 428 * 429 * @param bom BOMInputStream to detect byte order marks 430 * @param pis BOMInputStream to guess XML encoding 431 * @param httpContentType The HTTP content type 432 * @param lenient indicates if the charset encoding detection should be 433 * relaxed. 434 * @return the encoding to be used 435 * @throws IOException thrown if there is a problem reading the stream. 436 */ 437 private String doHttpStream(BOMInputStream bom, BOMInputStream pis, String httpContentType, 438 boolean lenient) throws IOException { 439 String bomEnc = bom.getBOMCharsetName(); 440 String xmlGuessEnc = pis.getBOMCharsetName(); 441 String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 442 try { 443 return calculateHttpEncoding(httpContentType, bomEnc, 444 xmlGuessEnc, xmlEnc, lenient); 445 } catch (XmlStreamReaderException ex) { 446 if (lenient) { 447 return doLenientDetection(httpContentType, ex); 448 } else { 449 throw ex; 450 } 451 } 452 } 453 454 /** 455 * Do lenient detection. 456 * 457 * @param httpContentType content-type header to use for the resolution of 458 * the charset encoding. 459 * @param ex The thrown exception 460 * @return the encoding 461 * @throws IOException thrown if there is a problem reading the stream. 462 */ 463 private String doLenientDetection(String httpContentType, 464 XmlStreamReaderException ex) throws IOException { 465 if (httpContentType != null && httpContentType.startsWith("text/html")) { 466 httpContentType = httpContentType.substring("text/html".length()); 467 httpContentType = "text/xml" + httpContentType; 468 try { 469 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), 470 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 471 } catch (XmlStreamReaderException ex2) { 472 ex = ex2; 473 } 474 } 475 String encoding = ex.getXmlEncoding(); 476 if (encoding == null) { 477 encoding = ex.getContentTypeEncoding(); 478 } 479 if (encoding == null) { 480 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 481 } 482 return encoding; 483 } 484 485 /** 486 * Calculate the raw encoding. 487 * 488 * @param bomEnc BOM encoding 489 * @param xmlGuessEnc XML Guess encoding 490 * @param xmlEnc XML encoding 491 * @return the raw encoding 492 * @throws IOException thrown if there is a problem reading the stream. 493 */ 494 String calculateRawEncoding(String bomEnc, String xmlGuessEnc, 495 String xmlEnc) throws IOException { 496 497 // BOM is Null 498 if (bomEnc == null) { 499 if (xmlGuessEnc == null || xmlEnc == null) { 500 return defaultEncoding == null ? UTF_8 : defaultEncoding; 501 } 502 if (xmlEnc.equals(UTF_16) && 503 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 504 return xmlGuessEnc; 505 } 506 return xmlEnc; 507 } 508 509 // BOM is UTF-8 510 if (bomEnc.equals(UTF_8)) { 511 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 512 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 513 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 514 } 515 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 516 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 517 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 518 } 519 return bomEnc; 520 } 521 522 // BOM is UTF-16BE or UTF-16LE 523 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 524 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 525 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 526 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 527 } 528 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 529 String msg = MessageFormat.format(RAW_EX_1, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 530 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 531 } 532 return bomEnc; 533 } 534 535 // BOM is something else 536 String msg = MessageFormat.format(RAW_EX_2, new Object[] { bomEnc, xmlGuessEnc, xmlEnc }); 537 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 538 } 539 540 541 /** 542 * Calculate the HTTP encoding. 543 * 544 * @param httpContentType The HTTP content type 545 * @param bomEnc BOM encoding 546 * @param xmlGuessEnc XML Guess encoding 547 * @param xmlEnc XML encoding 548 * @param lenient indicates if the charset encoding detection should be 549 * relaxed. 550 * @return the HTTP encoding 551 * @throws IOException thrown if there is a problem reading the stream. 552 */ 553 String calculateHttpEncoding(String httpContentType, 554 String bomEnc, String xmlGuessEnc, String xmlEnc, 555 boolean lenient) throws IOException { 556 557 // Lenient and has XML encoding 558 if (lenient && xmlEnc != null) { 559 return xmlEnc; 560 } 561 562 // Determine mime/encoding content types from HTTP Content Type 563 String cTMime = getContentTypeMime(httpContentType); 564 String cTEnc = getContentTypeEncoding(httpContentType); 565 boolean appXml = isAppXml(cTMime); 566 boolean textXml = isTextXml(cTMime); 567 568 // Mime type NOT "application/xml" or "text/xml" 569 if (!appXml && !textXml) { 570 String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 571 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 572 } 573 574 // No content type encoding 575 if (cTEnc == null) { 576 if (appXml) { 577 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 578 } else { 579 return defaultEncoding == null ? US_ASCII : defaultEncoding; 580 } 581 } 582 583 // UTF-16BE or UTF-16LE content type encoding 584 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 585 if (bomEnc != null) { 586 String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 587 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 588 } 589 return cTEnc; 590 } 591 592 // UTF-16 content type encoding 593 if (cTEnc.equals(UTF_16)) { 594 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 595 return bomEnc; 596 } 597 String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 598 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 599 } 600 601 return cTEnc; 602 } 603 604 /** 605 * Returns MIME type or NULL if httpContentType is NULL. 606 * 607 * @param httpContentType the HTTP content type 608 * @return The mime content type 609 */ 610 static String getContentTypeMime(String httpContentType) { 611 String mime = null; 612 if (httpContentType != null) { 613 int i = httpContentType.indexOf(";"); 614 if (i >= 0) { 615 mime = httpContentType.substring(0, i); 616 } else { 617 mime = httpContentType; 618 } 619 mime = mime.trim(); 620 } 621 return mime; 622 } 623 624 private static final Pattern CHARSET_PATTERN = Pattern 625 .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 626 627 /** 628 * Returns charset parameter value, NULL if not present, NULL if 629 * httpContentType is NULL. 630 * 631 * @param httpContentType the HTTP content type 632 * @return The content type encoding (upcased) 633 */ 634 static String getContentTypeEncoding(String httpContentType) { 635 String encoding = null; 636 if (httpContentType != null) { 637 int i = httpContentType.indexOf(";"); 638 if (i > -1) { 639 String postMime = httpContentType.substring(i + 1); 640 Matcher m = CHARSET_PATTERN.matcher(postMime); 641 encoding = m.find() ? m.group(1) : null; 642 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null; 643 } 644 } 645 return encoding; 646 } 647 648 public static final Pattern ENCODING_PATTERN = Pattern.compile( 649 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", 650 Pattern.MULTILINE); 651 652 /** 653 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. 654 * 655 * @param is InputStream to create the reader from. 656 * @param guessedEnc guessed encoding 657 * @return the encoding declared in the <?xml encoding=...?> 658 * @throws IOException thrown if there is a problem reading the stream. 659 */ 660 private static String getXmlProlog(InputStream is, String guessedEnc) 661 throws IOException { 662 String encoding = null; 663 if (guessedEnc != null) { 664 byte[] bytes = new byte[BUFFER_SIZE]; 665 is.mark(BUFFER_SIZE); 666 int offset = 0; 667 int max = BUFFER_SIZE; 668 int c = is.read(bytes, offset, max); 669 int firstGT = -1; 670 String xmlProlog = null; 671 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { 672 offset += c; 673 max -= c; 674 c = is.read(bytes, offset, max); 675 xmlProlog = new String(bytes, 0, offset, guessedEnc); 676 firstGT = xmlProlog.indexOf('>'); 677 } 678 if (firstGT == -1) { 679 if (c == -1) { 680 throw new IOException("Unexpected end of XML stream"); 681 } else { 682 throw new IOException( 683 "XML prolog or ROOT element not found on first " 684 + offset + " bytes"); 685 } 686 } 687 int bytesRead = offset; 688 if (bytesRead > 0) { 689 is.reset(); 690 BufferedReader bReader = new BufferedReader(new StringReader( 691 xmlProlog.substring(0, firstGT + 1))); 692 StringBuffer prolog = new StringBuffer(); 693 String line = bReader.readLine(); 694 while (line != null) { 695 prolog.append(line); 696 line = bReader.readLine(); 697 } 698 Matcher m = ENCODING_PATTERN.matcher(prolog); 699 if (m.find()) { 700 encoding = m.group(1).toUpperCase(); 701 encoding = encoding.substring(1, encoding.length() - 1); 702 } 703 } 704 } 705 return encoding; 706 } 707 708 /** 709 * Indicates if the MIME type belongs to the APPLICATION XML family. 710 * 711 * @param mime The mime type 712 * @return true if the mime type belongs to the APPLICATION XML family, 713 * otherwise false 714 */ 715 static boolean isAppXml(String mime) { 716 return mime != null && 717 (mime.equals("application/xml") || 718 mime.equals("application/xml-dtd") || 719 mime.equals("application/xml-external-parsed-entity") || 720 mime.startsWith("application/") && mime.endsWith("+xml")); 721 } 722 723 /** 724 * Indicates if the MIME type belongs to the TEXT XML family. 725 * 726 * @param mime The mime type 727 * @return true if the mime type belongs to the TEXT XML family, 728 * otherwise false 729 */ 730 static boolean isTextXml(String mime) { 731 return mime != null && 732 (mime.equals("text/xml") || 733 mime.equals("text/xml-external-parsed-entity") || 734 mime.startsWith("text/") && mime.endsWith("+xml")); 735 } 736 737 private static final String RAW_EX_1 = 738 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 739 740 private static final String RAW_EX_2 = 741 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 742 743 private static final String HTTP_EX_1 = 744 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; 745 746 private static final String HTTP_EX_2 = 747 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 748 749 private static final String HTTP_EX_3 = 750 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; 751 752 }