1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.io.input;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.file.Files;
33 import java.nio.file.Path;
34 import java.text.MessageFormat;
35 import java.util.Locale;
36 import java.util.Objects;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.io.ByteOrderMark;
41 import org.apache.commons.io.Charsets;
42 import org.apache.commons.io.IOUtils;
43 import org.apache.commons.io.build.AbstractStreamBuilder;
44 import org.apache.commons.io.function.IOConsumer;
45 import org.apache.commons.io.output.XmlStreamWriter;
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75 public class XmlStreamReader extends Reader {
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
121
122 private boolean nullCharset = true;
123 private boolean lenient = true;
124 private String httpContentType;
125
126
127
128
129 public Builder() {
130
131 }
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156 @Override
157 public XmlStreamReader get() throws IOException {
158 final String defaultEncoding = nullCharset ? null : getCharset().name();
159
160 return httpContentType == null
161 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
162 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
163
164 }
165
166 @Override
167 public Builder setCharset(final Charset charset) {
168 nullCharset = charset == null;
169 return super.setCharset(charset);
170 }
171
172 @Override
173 public Builder setCharset(final String charset) {
174 nullCharset = charset == null;
175 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
176 }
177
178
179
180
181
182
183
184 public Builder setHttpContentType(final String httpContentType) {
185 this.httpContentType = httpContentType;
186 return this;
187 }
188
189
190
191
192
193
194
195 public Builder setLenient(final boolean lenient) {
196 this.lenient = lenient;
197 return this;
198 }
199
200 }
201
202 private static final String UTF_8 = StandardCharsets.UTF_8.name();
203
204 private static final String US_ASCII = StandardCharsets.US_ASCII.name();
205
206 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
207
208 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
209
210 private static final String UTF_32BE = "UTF-32BE";
211
212 private static final String UTF_32LE = "UTF-32LE";
213
214 private static final String UTF_16 = StandardCharsets.UTF_16.name();
215
216 private static final String UTF_32 = "UTF-32";
217
218 private static final String EBCDIC = "CP1047";
219
220 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
221 ByteOrderMark.UTF_32LE };
222
223
224 private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
225 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
226 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
227 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
228 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
229
230 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248 public static final Pattern ENCODING_PATTERN = Pattern.compile(
249
250 "^<\\?xml\\s+"
251 + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
252 + "encoding\\s*=\\s*"
253 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"
254 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))",
255 Pattern.MULTILINE);
256
257
258 private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
259
260 private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
261
262 private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
263
264 private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
265
266 private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
267
268
269
270
271
272
273
274 public static Builder builder() {
275 return new Builder();
276 }
277
278
279
280
281
282
283
284 static String getContentTypeEncoding(final String httpContentType) {
285 String encoding = null;
286 if (httpContentType != null) {
287 final int i = httpContentType.indexOf(";");
288 if (i > -1) {
289 final String postMime = httpContentType.substring(i + 1);
290 final Matcher m = CHARSET_PATTERN.matcher(postMime);
291 encoding = m.find() ? m.group(1) : null;
292 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
293 }
294 }
295 return encoding;
296 }
297
298
299
300
301
302
303
304 static String getContentTypeMime(final String httpContentType) {
305 String mime = null;
306 if (httpContentType != null) {
307 final int i = httpContentType.indexOf(";");
308 mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
309 mime = mime.trim();
310 }
311 return mime;
312 }
313
314
315
316
317
318
319
320
321
322 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
323 String encoding = null;
324 if (guessedEnc != null) {
325 final byte[] bytes = IOUtils.byteArray();
326 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
327 int offset = 0;
328 int max = IOUtils.DEFAULT_BUFFER_SIZE;
329 int c = inputStream.read(bytes, offset, max);
330 int firstGT = -1;
331 String xmlProlog = "";
332 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
333 offset += c;
334 max -= c;
335 c = inputStream.read(bytes, offset, max);
336 xmlProlog = new String(bytes, 0, offset, guessedEnc);
337 firstGT = xmlProlog.indexOf('>');
338 }
339 if (firstGT == -1) {
340 if (c == -1) {
341 throw new IOException("Unexpected end of XML stream");
342 }
343 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
344 }
345 final int bytesRead = offset;
346 if (bytesRead > 0) {
347 inputStream.reset();
348 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
349 final StringBuilder prolog = new StringBuilder();
350 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
351 final Matcher m = ENCODING_PATTERN.matcher(prolog);
352 if (m.find()) {
353 encoding = m.group(1).toUpperCase(Locale.ROOT);
354 encoding = encoding.substring(1, encoding.length() - 1);
355 }
356 }
357 }
358 return encoding;
359 }
360
361
362
363
364
365
366
367 static boolean isAppXml(final String mime) {
368 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
369 || mime.startsWith("application/") && mime.endsWith("+xml"));
370 }
371
372
373
374
375
376
377
378 static boolean isTextXml(final String mime) {
379 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
380 }
381
382 private final Reader reader;
383
384 private final String encoding;
385
386 private final String defaultEncoding;
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/**
 * Constructs a reader for a file by delegating to the {@link Path} constructor.
 *
 * @param file the file to read.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if thrown by the delegated constructor.
 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    this(Objects.requireNonNull(file, "file").toPath());
}
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
/**
 * Constructs a reader for a raw stream with lenient detection enabled.
 *
 * @param inputStream the stream to read.
 * @throws IOException if thrown by the delegated constructor.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    this(inputStream, true);
}
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
/**
 * Constructs a reader for a raw stream with no default encoding.
 *
 * @param inputStream the stream to read.
 * @param lenient whether detection errors fall back to lenient detection.
 * @throws IOException if thrown by the delegated constructor.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
    this(inputStream, lenient, null);
}
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/**
 * Constructs a reader for a raw stream, detecting the encoding from the BOM and the XML prolog.
 * <p>
 * The stream is buffered and wrapped in two {@code BOMInputStream}s: one checking {@code BOMS} and one checking
 * {@code XML_GUESS_BYTES}. The detected encoding is then used to decode the stream.
 * </p>
 *
 * @param inputStream the stream to read.
 * @param lenient whether detection errors fall back to lenient detection.
 * @param defaultEncoding the fallback encoding name, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream or detecting the encoding fails.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    this.encoding = processHttpStream(bomInput, prologInput, lenient);
    this.reader = new InputStreamReader(prologInput, encoding);
}
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
/**
 * Constructs a reader for a stream with an HTTP content type and lenient detection enabled.
 *
 * @param inputStream the stream to read.
 * @param httpContentType the Content-Type header value, may be {@code null}.
 * @throws IOException if thrown by the delegated constructor.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
    this(inputStream, httpContentType, true);
}
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
/**
 * Constructs a reader for a stream with an HTTP content type and no default encoding.
 *
 * @param inputStream the stream to read.
 * @param httpContentType the Content-Type header value, may be {@code null}.
 * @param lenient whether detection errors fall back to lenient detection.
 * @throws IOException if thrown by the delegated constructor.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    this(inputStream, httpContentType, lenient, null);
}
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
/**
 * Constructs a reader for a stream with an HTTP content type, detecting the encoding from the content type,
 * the BOM and the XML prolog.
 * <p>
 * The stream is buffered and wrapped in two {@code BOMInputStream}s: one checking {@code BOMS} and one checking
 * {@code XML_GUESS_BYTES}. The detected encoding is then used to decode the stream.
 * </p>
 *
 * @param inputStream the stream to read.
 * @param httpContentType the Content-Type header value, may be {@code null}.
 * @param lenient whether detection errors fall back to lenient detection.
 * @param defaultEncoding the fallback encoding name, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream or detecting the encoding fails.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
        throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    this.encoding = processHttpStream(bomInput, prologInput, lenient, httpContentType);
    this.reader = new InputStreamReader(prologInput, encoding);
}
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
/**
 * Constructs a reader for a file path by opening it with {@link Files#newInputStream} and delegating to the
 * {@link InputStream} constructor.
 * <p>
 * NOTE(review): if the delegated constructor throws, nothing visible here closes the stream that was just
 * opened; a {@code this(...)} call cannot be wrapped in a {@code try} block.
 * </p>
 *
 * @param file the path of the file to read.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if the file cannot be opened or the delegated constructor fails.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final Path file) throws IOException {
    this(Files.newInputStream(Objects.requireNonNull(file, "file")));
}
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
/**
 * Constructs a reader for a URL by opening a connection and delegating to the {@link URLConnection}
 * constructor with no default encoding.
 *
 * @param url the URL to read.
 * @throws NullPointerException if {@code url} is {@code null}.
 * @throws IOException if the connection cannot be opened or the delegated constructor fails.
 */
public XmlStreamReader(final URL url) throws IOException {
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
/**
 * Constructs a reader for a URL connection, always using lenient detection.
 * <p>
 * HTTP detection rules are applied when the connection is an {@link HttpURLConnection} or reports a content
 * type; otherwise the raw-stream rules are applied.
 * </p>
 *
 * @param urlConnection the connection to read from.
 * @param defaultEncoding the fallback encoding name, may be {@code null}.
 * @throws NullPointerException if {@code urlConnection} is {@code null}.
 * @throws IOException if the connection stream cannot be read or the encoding cannot be determined.
 */
public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
    Objects.requireNonNull(urlConnection, "urlConnection");
    this.defaultEncoding = defaultEncoding;
    final boolean lenient = true;
    final String contentType = urlConnection.getContentType();
    final InputStream connectionStream = urlConnection.getInputStream();
    @SuppressWarnings("resource")
    final BOMInputStream bomInput = BOMInputStream.builder()
        .setInputStream(new BufferedInputStream(connectionStream, IOUtils.DEFAULT_BUFFER_SIZE))
        .setInclude(false)
        .setByteOrderMarks(BOMS)
        .get();
    @SuppressWarnings("resource")
    final BOMInputStream piInput = BOMInputStream.builder()
        .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
        .setInclude(true)
        .setByteOrderMarks(XML_GUESS_BYTES)
        .get();
    final boolean useHttpRules = urlConnection instanceof HttpURLConnection || contentType != null;
    this.encoding = useHttpRules
        ? processHttpStream(bomInput, piInput, lenient, contentType)
        : processHttpStream(bomInput, piInput, lenient);
    this.reader = new InputStreamReader(piInput, encoding);
}
696
697
698
699
700
701
702
703
704
705
706
707 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
708 throws IOException {
709
710
711 if (lenient && xmlEnc != null) {
712 return xmlEnc;
713 }
714
715
716 final String cTMime = getContentTypeMime(httpContentType);
717 final String cTEnc = getContentTypeEncoding(httpContentType);
718 final boolean appXml = isAppXml(cTMime);
719 final boolean textXml = isTextXml(cTMime);
720
721
722 if (!appXml && !textXml) {
723 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
724 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
725 }
726
727
728 if (cTEnc == null) {
729 if (appXml) {
730 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
731 }
732 return defaultEncoding == null ? US_ASCII : defaultEncoding;
733 }
734
735
736 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
737 if (bomEnc != null) {
738 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
739 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
740 }
741 return cTEnc;
742 }
743
744
745 if (cTEnc.equals(UTF_16)) {
746 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
747 return bomEnc;
748 }
749 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
750 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
751 }
752
753
754 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
755 if (bomEnc != null) {
756 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
757 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
758 }
759 return cTEnc;
760 }
761
762
763 if (cTEnc.equals(UTF_32)) {
764 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
765 return bomEnc;
766 }
767 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
768 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
769 }
770
771 return cTEnc;
772 }
773
774
775
776
777
778
779
780
781
782
783 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
784
785
786 if (bomEnc == null) {
787 if (xmlGuessEnc == null || xmlEnc == null) {
788 return defaultEncoding == null ? UTF_8 : defaultEncoding;
789 }
790 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
791 return xmlGuessEnc;
792 }
793 return xmlEnc;
794 }
795
796
797 if (bomEnc.equals(UTF_8)) {
798 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
799 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
800 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
801 }
802 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
803 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
804 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
805 }
806 return bomEnc;
807 }
808
809
810 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
811 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
812 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
813 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
814 }
815 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
816 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
817 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
818 }
819 return bomEnc;
820 }
821
822
823 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
824 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
825 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
826 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
827 }
828 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
829 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
830 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
831 }
832 return bomEnc;
833 }
834
835
836 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
837 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
838 }
839
840
841
842
843
844
845 @Override
846 public void close() throws IOException {
847 reader.close();
848 }
849
850
851
852
853
854
855
856
857
858 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
859 if (httpContentType != null && httpContentType.startsWith("text/html")) {
860 httpContentType = httpContentType.substring("text/html".length());
861 httpContentType = "text/xml" + httpContentType;
862 try {
863 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
864 } catch (final XmlStreamReaderException ex2) {
865 ex = ex2;
866 }
867 }
868 String encoding = ex.getXmlEncoding();
869 if (encoding == null) {
870 encoding = ex.getContentTypeEncoding();
871 }
872 if (encoding == null) {
873 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
874 }
875 return encoding;
876 }
877
878
879
880
881
882
883
884
885
886 public String getDefaultEncoding() {
887 return defaultEncoding;
888 }
889
890
891
892
893
894
895 public String getEncoding() {
896 return encoding;
897 }
898
899
900
901
902
903
904
905
906
907
908 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
909 final String bomEnc = bomInput.getBOMCharsetName();
910 final String xmlGuessEnc = piInput.getBOMCharsetName();
911 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
912 try {
913 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
914 } catch (final XmlStreamReaderException ex) {
915 if (lenient) {
916 return doLenientDetection(null, ex);
917 }
918 throw ex;
919 }
920 }
921
922
923
924
925
926
927
928
929
930
931
932 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
933 throws IOException {
934 final String bomEnc = bomInput.getBOMCharsetName();
935 final String xmlGuessEnc = piInput.getBOMCharsetName();
936 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
937 try {
938 return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
939 } catch (final XmlStreamReaderException ex) {
940 if (lenient) {
941 return doLenientDetection(httpContentType, ex);
942 }
943 throw ex;
944 }
945 }
946
947
948
949
950
951
952
953
954
955
956 @Override
957 public int read(final char[] buf, final int offset, final int len) throws IOException {
958 return reader.read(buf, offset, len);
959 }
960
961 }