1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.io.input;
18
19 import static org.apache.commons.io.IOUtils.EOF;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.Comparator;
25 import java.util.List;
26 import java.util.Objects;
27
28 import org.apache.commons.io.ByteOrderMark;
29 import org.apache.commons.io.IOUtils;
30
31 /**
32 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
33 * <p>
34 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
35 * first byte in the stream.
36 * </p>
37 * <p>
38 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
39 * </p>
40 * <ul>
41 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
46 * </ul>
47 * <p>
48 * To build an instance, use {@link Builder}.
49 * </p>
50 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
51 *
52 * <pre>
53 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
54 * if (bomIn.hasBOM()) {
55 * // has a UTF-8 BOM
56 * }
57 * </pre>
58 *
59 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
60 *
61 * <pre>
62 * boolean include = true;
63 * BOMInputStream bomIn = BOMInputStream.builder()
64 * .setInputStream(in)
65 * .setInclude(include)
66 * .get();
67 * if (bomIn.hasBOM()) {
68 * // has a UTF-8 BOM
69 * }
70 * </pre>
71 *
72 * <h2>Example 3 - Detecting Multiple BOMs</h2>
73 *
74 * <pre>
75 * BOMInputStream bomIn = BOMInputStream.builder()
76 * .setInputStream(in)
77 * .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
78 * .get();
79 * if (bomIn.hasBOM() == false) {
80 * // No BOM found
81 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
82 * // has a UTF-16LE BOM
83 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
84 * // has a UTF-16BE BOM
85 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
86 * // has a UTF-32LE BOM
87 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
88 * // has a UTF-32BE BOM
89 * }
90 * </pre>
91 * <p>
92 * To build an instance, use {@link Builder}.
93 * </p>
94 * <p>
95 * This class is not thread-safe.
96 * </p>
97 *
98 * @see Builder
99 * @see org.apache.commons.io.ByteOrderMark
100 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
101 * @since 2.0
102 */
103 public class BOMInputStream extends ProxyInputStream {
104
105 // @formatter:off
106 /**
107 * Builds a new {@link BOMInputStream}.
108 *
109 * <h2>Using NIO</h2>
110 * <pre>{@code
111 * BOMInputStream s = BOMInputStream.builder()
112 * .setPath(Paths.get("MyFile.xml"))
113 * .setByteOrderMarks(ByteOrderMark.UTF_8)
114 * .setInclude(false)
115 * .get();}
116 * </pre>
117 * <h2>Using IO</h2>
118 * <pre>{@code
119 * BOMInputStream s = BOMInputStream.builder()
120 * .setFile(new File("MyFile.xml"))
121 * .setByteOrderMarks(ByteOrderMark.UTF_8)
122 * .setInclude(false)
123 * .get();}
124 * </pre>
125 *
126 * @see #get()
127 * @since 2.12.0
128 */
129 // @formatter:on
130 public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
131
132 private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
133
134 /**
135 * For test access.
136 *
137 * @return the default byte order mark
138 */
139 static ByteOrderMark getDefaultByteOrderMark() {
140 return DEFAULT[0];
141 }
142
143 private ByteOrderMark[] byteOrderMarks = DEFAULT;
144
145 private boolean include;
146
147 /**
148 * Constructs a new builder of {@link BOMInputStream}.
149 */
150 public Builder() {
151 // empty
152 }
153
154 /**
155 * Builds a new {@link BOMInputStream}.
156 * <p>
157 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
158 * </p>
159 * <p>
160 * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
161 * </p>
162 * <p>
163 * This builder uses the following aspects:
164 * </p>
165 * <ul>
166 * <li>{@link #getInputStream()}</li>
167 * <li>include}</li>
168 * <li>byteOrderMarks</li>
169 * </ul>
170 *
171 * @return a new instance.
172 * @throws IllegalStateException if the {@code origin} is {@code null}.
173 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
174 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
175 * @see #getInputStream()
176 * @see #getUnchecked()
177 */
178 @Override
179 public BOMInputStream get() throws IOException {
180 return new BOMInputStream(this);
181 }
182
183 /**
184 * Sets the ByteOrderMarks to detect and optionally exclude.
185 * <p>
186 * The default is {@link ByteOrderMark#UTF_8}.
187 * </p>
188 *
189 * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
190 * @return {@code this} instance.
191 */
192 public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
193 this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
194 return this;
195 }
196
197 /**
198 * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
199 * <p>
200 * The default is false.
201 * </p>
202 *
203 * @param include true to include the UTF-8 BOM or false to exclude it. return this;
204 * @return {@code this} instance.
205 */
206 public Builder setInclude(final boolean include) {
207 this.include = include;
208 return this;
209 }
210
211 }
212
213 /**
214 * Compares ByteOrderMark objects in descending length order.
215 */
216 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
217
218 /**
219 * Constructs a new {@link Builder}.
220 *
221 * @return a new {@link Builder}.
222 * @since 2.12.0
223 */
224 public static Builder builder() {
225 return new Builder();
226 }
227
/**
 * The BOMs this stream looks for, sorted from longest to shortest.
 */
private final List<ByteOrderMark> bomList;

/** The BOM returned by the most recent detection, or {@code null} if none matched (or detection has not run yet). */
private ByteOrderMark byteOrderMark;
/** Index into {@link #firstBytes} of the next value handed out by {@code readFirstBytes()}. */
private int fbIndex;
/** Values read from the start of the delegate during BOM detection; {@code null} until detection has run. */
private int[] firstBytes;
/** When {@code false}, a detected BOM is skipped instead of being returned to the caller. */
private final boolean include;
/** Whether the last {@code mark(int)} call happened before BOM detection had run. */
private boolean markedAtStart;
/** The value of {@link #fbIndex} at the last {@code mark(int)} call; restored by {@code reset()}. */
private int markFbIndex;
239
/**
 * Constructs a new instance from the given builder.
 * <p>
 * The builder's byte order marks are validated <em>before</em> the superclass constructor is invoked, so that an invalid configuration is rejected
 * without first acquiring the builder's input stream. The marks are then copied and the copy is sorted from longest to shortest, leaving the
 * builder's own array untouched.
 * </p>
 *
 * @param builder the builder holding the origin, the {@code include} flag and the byte order marks.
 * @throws IllegalArgumentException if the builder has no byte order marks.
 * @throws IOException              if thrown by the superclass constructor.
 */
private BOMInputStream(final Builder builder) throws IOException {
    // Validate first: the superclass constructor is presumably where the builder's origin is opened,
    // and failing after that point would leave the opened stream unclosed.
    super(requireByteOrderMarks(builder));
    this.include = builder.include;
    // Arrays.asList is a view over its argument; sort a private copy so the builder's array is neither reordered nor shared.
    final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks.clone());
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    list.sort(ByteOrderMarkLengthComparator);
    this.bomList = list;
}

/**
 * Checks that the builder has at least one byte order mark configured.
 *
 * @param builder the builder to check.
 * @return the given builder, for use inside an explicit constructor invocation.
 * @throws IllegalArgumentException if the builder has no byte order marks.
 */
private static Builder requireByteOrderMarks(final Builder builder) {
    if (IOUtils.length(builder.byteOrderMarks) == 0) {
        throw new IllegalArgumentException("No ByteOrderMark specified.");
    }
    return builder;
}
251
/**
 * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} BOM and excludes it.
 *
 * @param delegate the InputStream to delegate to
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public BOMInputStream(final InputStream delegate) {
    // Chains through the (delegate, include) constructor, which supplies the default UTF-8 BOM.
    this(delegate, false);
}
263
/**
 * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
 * <p>
 * Delegates to the three-argument constructor with the builder's default BOM array.
 * </p>
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the UTF-8 BOM or false to exclude it
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final boolean include) {
    this(delegate, include, Builder.DEFAULT);
}
277
/**
 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 * <p>
 * The given BOMs are copied before being sorted from longest to shortest, so the caller's array is neither reordered nor retained.
 * </p>
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the specified BOMs or false to exclude them
 * @param boms
 *            The BOMs to detect and optionally exclude
 * @throws IllegalArgumentException
 *             if no BOMs are specified
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
    super(delegate);
    if (IOUtils.length(boms) == 0) {
        throw new IllegalArgumentException("No BOMs specified");
    }
    this.include = include;
    // Arrays.asList is a write-through view over its argument: sorting it directly would reorder the caller's
    // array, and later changes to that array would leak into this stream. Work on a private copy instead.
    final List<ByteOrderMark> list = Arrays.asList(boms.clone());
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    list.sort(ByteOrderMarkLengthComparator);
    this.bomList = list;
}
301
/**
 * Constructs a new BOM InputStream that excludes the specified BOMs.
 * <p>
 * Delegates to the three-argument constructor with {@code include} set to {@code false}.
 * </p>
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param boms
 *            The BOMs to detect and exclude
 * @throws IllegalArgumentException
 *             if no BOMs are specified
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
    this(delegate, false, boms);
}
315
316 /**
317 * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
318 *
319 * @return The matched BOM or null if none matched.
320 */
321 private ByteOrderMark find() {
322 return bomList.stream().filter(this::matches).findFirst().orElse(null);
323 }
324
325 /**
326 * Gets the ByteOrderMark (Byte Order Mark).
327 *
328 * @return The BOM or null if none matched.
329 * @throws IOException
330 * if an error reading the first bytes of the stream occurs.
331 */
332 public ByteOrderMark getBOM() throws IOException {
333 if (firstBytes == null) {
334 byteOrderMark = readBom();
335 }
336 return byteOrderMark;
337 }
338
339 /**
340 * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
341 *
342 * @return The BOM charset Name or null if no BOM found
343 * @throws IOException
344 * if an error reading the first bytes of the stream occurs
345 */
346 public String getBOMCharsetName() throws IOException {
347 getBOM();
348 return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
349 }
350
351 /**
352 * Tests whether the stream contains one of the specified BOMs.
353 *
354 * @return true if the stream has one of the specified BOMs, otherwise false if it does not
355 * @throws IOException
356 * if an error reading the first bytes of the stream occurs
357 */
358 public boolean hasBOM() throws IOException {
359 return getBOM() != null;
360 }
361
362 /**
363 * Tests whether the stream contains the specified BOM.
364 *
365 * @param bom
366 * The BOM to check for
367 * @return true if the stream has the specified BOM, otherwise false if it does not
368 * @throws IllegalArgumentException
369 * if the BOM is not one the stream is configured to detect
370 * @throws IOException
371 * if an error reading the first bytes of the stream occurs
372 */
373 public boolean hasBOM(final ByteOrderMark bom) throws IOException {
374 if (!bomList.contains(bom)) {
375 throw new IllegalArgumentException("Stream not configured to detect " + bom);
376 }
377 return Objects.equals(getBOM(), bom);
378 }
379
380 /**
381 * Invokes the delegate's {@code mark(int)} method.
382 *
383 * @param readLimit
384 * read ahead limit
385 */
386 @Override
387 public synchronized void mark(final int readLimit) {
388 markFbIndex = fbIndex;
389 markedAtStart = firstBytes == null;
390 in.mark(readLimit);
391 }
392
393 /**
394 * Checks if the bytes match a BOM.
395 *
396 * @param bom
397 * The BOM
398 * @return true if the bytes match the bom, otherwise false
399 */
400 private boolean matches(final ByteOrderMark bom) {
401 return bom.matches(firstBytes);
402 }
403
404 /**
405 * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
406 *
407 * @return the byte read (excluding BOM) or -1 if the end of stream
408 * @throws IOException
409 * if an I/O error occurs
410 */
411 @Override
412 public int read() throws IOException {
413 checkOpen();
414 final int b = readFirstBytes();
415 return b >= 0 ? b : in.read();
416 }
417
418 /**
419 * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
420 *
421 * @param buf
422 * the buffer to read the bytes into, never {@code null}
423 * @return the number of bytes read (excluding BOM) or -1 if the end of stream
424 * @throws NullPointerException
425 * if the buffer is {@code null}
426 * @throws IOException
427 * if an I/O error occurs
428 */
429 @Override
430 public int read(final byte[] buf) throws IOException {
431 return read(buf, 0, buf.length);
432 }
433
434 /**
435 * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
436 *
437 * @param buf
438 * the buffer to read the bytes into
439 * @param off
440 * The start offset
441 * @param len
442 * The number of bytes to read (excluding BOM)
443 * @return the number of bytes read or -1 if the end of stream
444 * @throws NullPointerException
445 * if the buffer is {@code null}
446 * @throws IndexOutOfBoundsException
447 * if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code buf.length}
448 * @throws IOException
449 * if an I/O error occurs
450 */
451 @Override
452 public int read(final byte[] buf, int off, int len) throws IOException {
453 IOUtils.checkFromIndexSize(buf, off, len);
454 if (len == 0) {
455 return 0;
456 }
457 int firstCount = 0;
458 int b = 0;
459 while (len > 0 && b >= 0) {
460 b = readFirstBytes();
461 if (b >= 0) {
462 buf[off++] = (byte) (b & 0xFF);
463 len--;
464 firstCount++;
465 }
466 }
467 final int secondCount = in.read(buf, off, len);
468 afterRead(secondCount);
469 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
470 }
471
472 private ByteOrderMark readBom() throws IOException {
473 int fbLength = 0;
474 // BOMs are sorted from longest to shortest
475 final int maxBomSize = bomList.get(0).length();
476 final int[] tmp = new int[maxBomSize];
477 // Read first maxBomSize bytes
478 for (int i = 0; i < tmp.length; i++) {
479 tmp[i] = in.read();
480 afterRead(tmp[i]);
481 fbLength++;
482 if (tmp[i] < 0) {
483 break;
484 }
485 }
486 firstBytes = Arrays.copyOf(tmp, fbLength);
487 // match BOM in firstBytes
488 final ByteOrderMark bom = find();
489 if (bom != null && !include) {
490 if (bom.length() < firstBytes.length) {
491 fbIndex = bom.length();
492 } else {
493 firstBytes = new int[0];
494 }
495 }
496 return bom;
497 }
498
499 /**
500 * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
501 * valid byte or -1 to indicate that the initial bytes have been processed already.
502 *
503 * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
504 * @throws IOException if an I/O error occurs
505 */
506 private int readFirstBytes() throws IOException {
507 getBOM();
508 return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
509 }
510
511 /**
512 * Invokes the delegate's {@code reset()} method.
513 *
514 * @throws IOException
515 * if an I/O error occurs
516 */
517 @Override
518 public synchronized void reset() throws IOException {
519 fbIndex = markFbIndex;
520 if (markedAtStart) {
521 firstBytes = null;
522 }
523 in.reset();
524 }
525
526 /**
527 * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
528 *
529 * @param n
530 * the number of bytes to skip
531 * @return the number of bytes to skipped or -1 if the end of stream
532 * @throws IOException
533 * if an I/O error occurs
534 */
535 @Override
536 public long skip(final long n) throws IOException {
537 int skipped = 0;
538 while (n > skipped && readFirstBytes() >= 0) {
539 skipped++;
540 }
541 return in.skip(n - skipped) + skipped;
542 }
543 }