1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.io.input;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.file.Files;
33 import java.nio.file.Path;
34 import java.text.MessageFormat;
35 import java.util.Locale;
36 import java.util.Objects;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.io.ByteOrderMark;
41 import org.apache.commons.io.Charsets;
42 import org.apache.commons.io.IOUtils;
43 import org.apache.commons.io.build.AbstractStreamBuilder;
44 import org.apache.commons.io.function.IOConsumer;
45 import org.apache.commons.io.output.XmlStreamWriter;
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 public class XmlStreamReader extends Reader {
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
114
115 private boolean nullCharset = true;
116 private boolean lenient = true;
117 private String httpContentType;
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135 @SuppressWarnings("resource")
136 @Override
137 public XmlStreamReader get() throws IOException {
138 final String defaultEncoding = nullCharset ? null : getCharset().name();
139
140 return httpContentType == null
141 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
142 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
143
144 }
145
146 @Override
147 public Builder setCharset(final Charset charset) {
148 nullCharset = charset == null;
149 return super.setCharset(charset);
150 }
151
152 @Override
153 public Builder setCharset(final String charset) {
154 nullCharset = charset == null;
155 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
156 }
157
158
159
160
161
162
163
164 public Builder setHttpContentType(final String httpContentType) {
165 this.httpContentType = httpContentType;
166 return this;
167 }
168
169
170
171
172
173
174
175 public Builder setLenient(final boolean lenient) {
176 this.lenient = lenient;
177 return this;
178 }
179
180 }
181
// Charset names compared against detected encodings throughout this class.
private static final String UTF_8 = StandardCharsets.UTF_8.name();

private static final String US_ASCII = StandardCharsets.US_ASCII.name();

private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

private static final String UTF_32BE = "UTF-32BE";

private static final String UTF_32LE = "UTF-32LE";

private static final String UTF_16 = StandardCharsets.UTF_16.name();

private static final String UTF_32 = "UTF-32";

private static final String EBCDIC = "CP1047";

/** Byte order marks looked for at the start of the stream by the constructors. */
private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
ByteOrderMark.UTF_32LE };

/**
 * Byte signatures used to guess the encoding from the first bytes of the document.
 * 0x3C 0x3F 0x78 0x6D is {@code <?xm} in ASCII; the UTF-16/UTF-32 entries are the same characters
 * in wider code units (the UTF-16 entries cover only {@code <?}). The last entry is presumably
 * the same text in EBCDIC (CP1047) - TODO confirm.
 */
private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

/** Extracts the optionally quoted {@code charset=} value from the parameter part of a content type (group 1). */
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
211
212
213
214
215
216
217
218 public static final Pattern ENCODING_PATTERN = Pattern.compile(
219
220 "^<\\?xml\\s+"
221 + "version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+"
222 + "encoding\\s*=\\s*"
223 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"
224 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))",
225 Pattern.MULTILINE);
226
227
228
229
230
231
// MessageFormat templates for XmlStreamReaderException messages.
// RAW_* are formatted in calculateRawEncoding, HTTP_* in calculateHttpEncoding.
private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
241
242
243
244
245
246
247
/**
 * Constructs a new {@link Builder}.
 *
 * @return a new {@link Builder}.
 */
public static Builder builder() {
    return new Builder();
}
251
252
253
254
255
256
257
258 static String getContentTypeEncoding(final String httpContentType) {
259 String encoding = null;
260 if (httpContentType != null) {
261 final int i = httpContentType.indexOf(";");
262 if (i > -1) {
263 final String postMime = httpContentType.substring(i + 1);
264 final Matcher m = CHARSET_PATTERN.matcher(postMime);
265 encoding = m.find() ? m.group(1) : null;
266 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
267 }
268 }
269 return encoding;
270 }
271
272
273
274
275
276
277
278 static String getContentTypeMime(final String httpContentType) {
279 String mime = null;
280 if (httpContentType != null) {
281 final int i = httpContentType.indexOf(";");
282 if (i >= 0) {
283 mime = httpContentType.substring(0, i);
284 } else {
285 mime = httpContentType;
286 }
287 mime = mime.trim();
288 }
289 return mime;
290 }
291
292
293
294
295
296
297
298
299
300 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
301 String encoding = null;
302 if (guessedEnc != null) {
303 final byte[] bytes = IOUtils.byteArray();
304 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
305 int offset = 0;
306 int max = IOUtils.DEFAULT_BUFFER_SIZE;
307 int c = inputStream.read(bytes, offset, max);
308 int firstGT = -1;
309 String xmlProlog = "";
310 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
311 offset += c;
312 max -= c;
313 c = inputStream.read(bytes, offset, max);
314 xmlProlog = new String(bytes, 0, offset, guessedEnc);
315 firstGT = xmlProlog.indexOf('>');
316 }
317 if (firstGT == -1) {
318 if (c == -1) {
319 throw new IOException("Unexpected end of XML stream");
320 }
321 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
322 }
323 final int bytesRead = offset;
324 if (bytesRead > 0) {
325 inputStream.reset();
326 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
327 final StringBuilder prolog = new StringBuilder();
328 IOConsumer.forEach(bReader.lines(), prolog::append);
329 final Matcher m = ENCODING_PATTERN.matcher(prolog);
330 if (m.find()) {
331 encoding = m.group(1).toUpperCase(Locale.ROOT);
332 encoding = encoding.substring(1, encoding.length() - 1);
333 }
334 }
335 }
336 return encoding;
337 }
338
339
340
341
342
343
344
345 static boolean isAppXml(final String mime) {
346 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
347 || mime.startsWith("application/") && mime.endsWith("+xml"));
348 }
349
350
351
352
353
354
355
356 static boolean isTextXml(final String mime) {
357 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
358 }
359
/** Decoding reader that {@code read} and {@code close} delegate to. */
private final Reader reader;

/** Encoding selected during construction; also used to create {@link #reader}. */
private final String encoding;

/** Caller-supplied fallback encoding, may be {@code null}. */
private final String defaultEncoding;
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
/**
 * Constructs a reader for a file by delegating to the {@link Path} constructor.
 *
 * @param file the file to read, must not be {@code null}.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if thrown by the delegate constructor.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    this(Objects.requireNonNull(file, "file").toPath());
}
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
/**
 * Constructs a reader for a raw stream with lenient detection enabled.
 *
 * @param inputStream the stream to read.
 * @throws IOException if thrown by the delegate constructor.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    this(inputStream, true);
}
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/**
 * Constructs a reader for a raw stream with no default encoding.
 *
 * @param inputStream the stream to read.
 * @param lenient whether encoding detection failures are handled leniently instead of thrown.
 * @throws IOException if thrown by the delegate constructor.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
    this(inputStream, lenient, null);
}
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
/**
 * Constructs a reader for a raw stream (no HTTP content type).
 * <p>
 * The stream is buffered and wrapped twice: once to detect and skip a byte order mark, and once
 * to detect the XML guess bytes while keeping them in the stream. The encoding is then computed
 * from the BOM, the guess and the XML prolog.
 * </p>
 *
 * @param inputStream the stream to read, must not be {@code null}.
 * @param lenient whether encoding detection failures are handled leniently instead of thrown.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream fails or, when not lenient, the encoding cannot be determined.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    final String detected = processHttpStream(bomInput, prologInput, lenient);
    this.encoding = detected;
    this.reader = new InputStreamReader(prologInput, detected);
}
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
/**
 * Constructs a reader for a stream with an HTTP content type, with lenient detection enabled.
 *
 * @param inputStream the stream to read.
 * @param httpContentType the HTTP content type, including any charset parameter.
 * @throws IOException if thrown by the delegate constructor.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
    this(inputStream, httpContentType, true);
}
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
/**
 * Constructs a reader for a stream with an HTTP content type and no default encoding.
 *
 * @param inputStream the stream to read.
 * @param httpContentType the HTTP content type, including any charset parameter.
 * @param lenient whether encoding detection failures are handled leniently instead of thrown.
 * @throws IOException if thrown by the delegate constructor.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    this(inputStream, httpContentType, lenient, null);
}
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
/**
 * Constructs a reader for a stream served with an HTTP content type.
 * <p>
 * The stream is buffered and wrapped twice: once to detect and skip a byte order mark, and once
 * to detect the XML guess bytes while keeping them in the stream. The encoding is then computed
 * from the content type, the BOM, the guess and the XML prolog.
 * </p>
 *
 * @param inputStream the stream to read, must not be {@code null}.
 * @param httpContentType the HTTP content type, including any charset parameter.
 * @param lenient whether encoding detection failures are handled leniently instead of thrown.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream fails or, when not lenient, the encoding cannot be determined.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
        throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    final String detected = processHttpStream(bomInput, prologInput, lenient, httpContentType);
    this.encoding = detected;
    this.reader = new InputStreamReader(prologInput, detected);
}
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
/**
 * Constructs a reader for a file path by opening it with {@link Files#newInputStream} and
 * delegating to the {@link InputStream} constructor.
 *
 * @param file the path to read, must not be {@code null}.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if the file cannot be opened or the delegate constructor fails.
 * @deprecated Annotated {@code @Deprecated}; {@link #builder()} appears to be the intended replacement.
 */
// NOTE(review): if the delegate constructor throws, the stream opened here does not appear to be closed - confirm.
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final Path file) throws IOException {
    this(Files.newInputStream(Objects.requireNonNull(file, "file")));
}
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
/**
 * Constructs a reader for a URL by opening a connection and delegating to the
 * {@link URLConnection} constructor with no default encoding.
 *
 * @param url the URL to read, must not be {@code null}.
 * @throws NullPointerException if {@code url} is {@code null}.
 * @throws IOException if the connection cannot be opened or the delegate constructor fails.
 */
public XmlStreamReader(final URL url) throws IOException {
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
/**
 * Constructs a reader for the content of a URL connection, always using lenient detection.
 * <p>
 * If the connection is an {@link HttpURLConnection} or reports a content type, the encoding is
 * computed using that content type; otherwise raw-stream detection is used.
 * </p>
 * <p>
 * The input stream is obtained from the connection by this constructor. If detection or reader
 * creation fails (for example a truncated stream, or a declared encoding the JVM does not
 * support), that stream is closed before the exception propagates.
 * </p>
 *
 * @param urlConnection the connection to read from, must not be {@code null}.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code urlConnection} is {@code null}.
 * @throws IOException if reading the stream or creating the reader fails.
 */
public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
    Objects.requireNonNull(urlConnection, "urlConnection");
    this.defaultEncoding = defaultEncoding;
    final boolean lenient = true;
    final String contentType = urlConnection.getContentType();
    final InputStream inputStream = urlConnection.getInputStream();
    final String detectedEncoding;
    final Reader decodingReader;
    try {
        @SuppressWarnings("resource")
        final BOMInputStream bomInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(false)
            .setByteOrderMarks(BOMS)
            .get();
        @SuppressWarnings("resource")
        final BOMInputStream piInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(true)
            .setByteOrderMarks(XML_GUESS_BYTES)
            .get();

        if (urlConnection instanceof HttpURLConnection || contentType != null) {
            detectedEncoding = processHttpStream(bomInput, piInput, lenient, contentType);
        } else {
            detectedEncoding = processHttpStream(bomInput, piInput, lenient);
        }
        decodingReader = new InputStreamReader(piInput, detectedEncoding);
    } catch (final IOException | RuntimeException e) {
        // No reader will own the stream, so release it here and keep the original failure.
        try {
            inputStream.close();
        } catch (final IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
    this.encoding = detectedEncoding;
    this.reader = decodingReader;
}
674
675
676
677
678
679
680
681
682
683
684
685
686 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
687 throws IOException {
688
689
690 if (lenient && xmlEnc != null) {
691 return xmlEnc;
692 }
693
694
695 final String cTMime = getContentTypeMime(httpContentType);
696 final String cTEnc = getContentTypeEncoding(httpContentType);
697 final boolean appXml = isAppXml(cTMime);
698 final boolean textXml = isTextXml(cTMime);
699
700
701 if (!appXml && !textXml) {
702 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
703 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
704 }
705
706
707 if (cTEnc == null) {
708 if (appXml) {
709 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
710 }
711 return defaultEncoding == null ? US_ASCII : defaultEncoding;
712 }
713
714
715 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
716 if (bomEnc != null) {
717 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719 }
720 return cTEnc;
721 }
722
723
724 if (cTEnc.equals(UTF_16)) {
725 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
726 return bomEnc;
727 }
728 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
729 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
730 }
731
732
733 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
734 if (bomEnc != null) {
735 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
736 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
737 }
738 return cTEnc;
739 }
740
741
742 if (cTEnc.equals(UTF_32)) {
743 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
744 return bomEnc;
745 }
746 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
747 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
748 }
749
750 return cTEnc;
751 }
752
753
754
755
756
757
758
759
760
761
762 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
763
764
765 if (bomEnc == null) {
766 if (xmlGuessEnc == null || xmlEnc == null) {
767 return defaultEncoding == null ? UTF_8 : defaultEncoding;
768 }
769 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
770 return xmlGuessEnc;
771 }
772 return xmlEnc;
773 }
774
775
776 if (bomEnc.equals(UTF_8)) {
777 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
778 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
779 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
780 }
781 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
782 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
783 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
784 }
785 return bomEnc;
786 }
787
788
789 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
790 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
791 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
792 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
793 }
794 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
795 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
796 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
797 }
798 return bomEnc;
799 }
800
801
802 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
803 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
804 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
805 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
806 }
807 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
808 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
809 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
810 }
811 return bomEnc;
812 }
813
814
815 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
816 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817 }
818
819
820
821
822
823
/**
 * Closes the underlying decoding reader.
 *
 * @throws IOException if the underlying reader fails to close.
 */
@Override
public void close() throws IOException {
    reader.close();
}
828
829
830
831
832
833
834
835
836
837 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
838 if (httpContentType != null && httpContentType.startsWith("text/html")) {
839 httpContentType = httpContentType.substring("text/html".length());
840 httpContentType = "text/xml" + httpContentType;
841 try {
842 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
843 } catch (final XmlStreamReaderException ex2) {
844 ex = ex2;
845 }
846 }
847 String encoding = ex.getXmlEncoding();
848 if (encoding == null) {
849 encoding = ex.getContentTypeEncoding();
850 }
851 if (encoding == null) {
852 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
853 }
854 return encoding;
855 }
856
857
858
859
860
861
862
863
864
/**
 * Gets the default encoding supplied at construction.
 *
 * @return the default encoding, or {@code null} if none was supplied.
 */
public String getDefaultEncoding() {
    return defaultEncoding;
}
868
869
870
871
872
873
/**
 * Gets the encoding selected during construction and used to decode the stream.
 *
 * @return the encoding name.
 */
public String getEncoding() {
    return encoding;
}
877
878
879
880
881
882
883
884
885
886
887 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
888 final String bomEnc = bomInput.getBOMCharsetName();
889 final String xmlGuessEnc = piInput.getBOMCharsetName();
890 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
891 try {
892 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
893 } catch (final XmlStreamReaderException ex) {
894 if (lenient) {
895 return doLenientDetection(null, ex);
896 }
897 throw ex;
898 }
899 }
900
901
902
903
904
905
906
907
908
909
910
911 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
912 throws IOException {
913 final String bomEnc = bomInput.getBOMCharsetName();
914 final String xmlGuessEnc = piInput.getBOMCharsetName();
915 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
916 try {
917 return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
918 } catch (final XmlStreamReaderException ex) {
919 if (lenient) {
920 return doLenientDetection(httpContentType, ex);
921 }
922 throw ex;
923 }
924 }
925
926
927
928
929
930
931
932
933
934
/**
 * Reads characters into a portion of an array by delegating to the underlying decoding reader.
 *
 * @param buf the destination buffer.
 * @param offset the offset in {@code buf} at which to start storing characters.
 * @param len the maximum number of characters to read.
 * @return the value returned by the underlying reader's {@code read(char[], int, int)}.
 * @throws IOException if the underlying reader fails.
 */
@Override
public int read(final char[] buf, final int offset, final int len) throws IOException {
    return reader.read(buf, offset, len);
}
939
940 }