1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.io.input;
19
20 import static org.apache.commons.io.IOUtils.EOF;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.Arrays;
25 import java.util.Comparator;
26 import java.util.List;
27 import java.util.Objects;
28
29 import org.apache.commons.io.ByteOrderMark;
30 import org.apache.commons.io.IOUtils;
31
32 /**
33 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
34 * <p>
35 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the first byte in the stream.
36 * </p>
37 * <p>
38 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
39 * </p>
40 * <ul>
41 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
46 * </ul>
47 * <p>
48 * To build an instance, use {@link Builder}.
49 * </p>
50 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
51 *
52 * <pre>
53 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
54 * if (bomIn.hasBOM()) {
55 * // has a UTF-8 BOM
56 * }
57 * </pre>
58 *
59 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
60 *
61 * <pre>
62 * boolean include = true;
63 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).setInclude(include).get();
64 * if (bomIn.hasBOM()) {
65 * // has a UTF-8 BOM
66 * }
67 * </pre>
68 *
69 * <h2>Example 3 - Detecting Multiple BOMs</h2>
70 *
71 * <pre>
72 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in)
73 * .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE).get();
74 * if (bomIn.hasBOM() == false) {
75 * // No BOM found
76 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
77 * // has a UTF-16LE BOM
78 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
79 * // has a UTF-16BE BOM
80 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
81 * // has a UTF-32LE BOM
82 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
83 * // has a UTF-32BE BOM
84 * }
85 * </pre>
86 * <p>
87 * To build an instance, use {@link Builder}.
88 * </p>
89 * <p>
90 * This class is not thread-safe.
91 * </p>
92 *
93 * @see Builder
94 * @see org.apache.commons.io.ByteOrderMark
95 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
96 * @since 2.0
97 */
98 public class BOMInputStream extends ProxyInputStream {
99
100 // @formatter:off
101 /**
102 * Builds a new {@link BOMInputStream}.
103 *
104 * <h2>Using NIO</h2>
105 * <pre>{@code
106 * BOMInputStream s = BOMInputStream.builder()
107 * .setPath(Paths.get("MyFile.xml"))
108 * .setByteOrderMarks(ByteOrderMark.UTF_8)
109 * .setInclude(false)
110 * .get();}
111 * </pre>
112 * <h2>Using IO</h2>
113 * <pre>{@code
114 * BOMInputStream s = BOMInputStream.builder()
115 * .setFile(new File("MyFile.xml"))
116 * .setByteOrderMarks(ByteOrderMark.UTF_8)
117 * .setInclude(false)
118 * .get();}
119 * </pre>
120 *
121 * @see #get()
122 * @since 2.12.0
123 */
124 // @formatter:on
125 public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
126
127 private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
128
129 /**
130 * For test access.
131 *
132 * @return the default byte order mark.
133 */
134 static ByteOrderMark getDefaultByteOrderMark() {
135 return DEFAULT[0];
136 }
137
138 private ByteOrderMark[] byteOrderMarks = DEFAULT;
139 private boolean include;
140
141 /**
142 * Constructs a new builder of {@link BOMInputStream}.
143 */
144 public Builder() {
145 // empty
146 }
147
148 /**
149 * Builds a new {@link BOMInputStream}.
150 * <p>
151 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
152 * </p>
153 * <p>
154 * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
155 * </p>
156 * <p>
157 * This builder uses the following aspects:
158 * </p>
159 * <ul>
160 * <li>{@link #getInputStream()}</li>
161 * <li>include}</li>
162 * <li>byteOrderMarks</li>
163 * </ul>
164 *
165 * @return a new instance.
166 * @throws IllegalStateException if the {@code origin} is {@code null}.
167 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
168 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
169 * @see #getInputStream()
170 * @see #getUnchecked()
171 */
172 @Override
173 public BOMInputStream get() throws IOException {
174 return new BOMInputStream(this);
175 }
176
177 /**
178 * Sets the ByteOrderMarks to detect and optionally exclude.
179 * <p>
180 * The default is {@link ByteOrderMark#UTF_8}.
181 * </p>
182 *
183 * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
184 * @return {@code this} instance.
185 */
186 public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
187 this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
188 return this;
189 }
190
191 /**
192 * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
193 * <p>
194 * The default is false.
195 * </p>
196 *
197 * @param include true to include the UTF-8 BOM or false to exclude it. return this;.
198 * @return {@code this} instance.
199 */
200 public Builder setInclude(final boolean include) {
201 this.include = include;
202 return this;
203 }
204 }
205
206 /**
207 * Compares ByteOrderMark objects in descending length order.
208 */
209 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
210
211 /**
212 * Constructs a new {@link Builder}.
213 *
214 * @return a new {@link Builder}.
215 * @since 2.12.0
216 */
217 public static Builder builder() {
218 return new Builder();
219 }
220
221 /**
222 * BOMs are sorted from longest to shortest.
223 */
224 private final List<ByteOrderMark> bomList;
225 private final ByteOrderMark byteOrderMark;
226 private int fbIndex;
227 private int[] firstBytes;
228 private final boolean include;
229 private boolean markedAtStart;
230 private int markFbIndex;
231
/**
 * Constructs a new instance from a builder and immediately reads ahead to detect the BOM.
 *
 * @param builder The builder supplying the origin, the BOMs to detect, and the include flag.
 * @throws IllegalArgumentException if the builder has no ByteOrderMark configured.
 * @throws IOException              if an error reading the first bytes of the stream occurs.
 */
private BOMInputStream(final Builder builder) throws IOException {
    super(builder);
    final ByteOrderMark[] configured = builder.byteOrderMarks;
    if (IOUtils.length(configured) == 0) {
        throw new IllegalArgumentException("No ByteOrderMark specified.");
    }
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    final List<ByteOrderMark> sorted = Arrays.asList(configured);
    sorted.sort(ByteOrderMarkLengthComparator);
    this.bomList = sorted;
    // readBom() consults both bomList and include, so they are assigned before it runs.
    this.include = builder.include;
    this.byteOrderMark = readBom();
}
250
/**
 * Constructs a new BOM InputStream that detects the default BOM ({@link ByteOrderMark#UTF_8}) and excludes it from the data returned.
 *
 * @param delegate the InputStream to delegate to.
 * @throws IOException if an error reading the first bytes of the stream occurs.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
 */
@Deprecated
public BOMInputStream(final InputStream delegate) throws IOException {
    // include = false: the detected BOM is skipped.
    this(delegate, false, Builder.getDefaultByteOrderMark());
}
262
/**
 * Constructs a new BOM InputStream that detects the default BOM ({@link ByteOrderMark#UTF_8}) and includes or excludes it as requested.
 *
 * @param delegate the InputStream to delegate to.
 * @param include  true to include the UTF-8 BOM in the data returned, or false to exclude it.
 * @throws IOException if an error reading the first bytes of the stream occurs.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final boolean include) throws IOException {
    this(delegate, include, Builder.getDefaultByteOrderMark());
}
275
/**
 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 * <p>
 * The given array is copied before it is sorted, so the caller's array is neither reordered nor shared with this stream.
 * </p>
 *
 * @param delegate the InputStream to delegate to.
 * @param include  true to include the specified BOMs or false to exclude them.
 * @param boms     The BOMs to detect and optionally exclude.
 * @throws IllegalArgumentException if {@code boms} is {@code null} or empty.
 * @throws IOException              if an error reading the first bytes of the stream occurs.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) throws IOException {
    super(delegate);
    if (IOUtils.length(boms) == 0) {
        throw new IllegalArgumentException("No BOMs specified");
    }
    this.include = include;
    // Arrays.asList is a write-through view of its array: sort a private copy so that the caller's array
    // is not reordered and later changes to it cannot alter the BOMs this stream detects.
    final List<ByteOrderMark> list = Arrays.asList(boms.clone());
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    list.sort(ByteOrderMarkLengthComparator);
    this.bomList = list;
    this.byteOrderMark = readBom();
}
298
/**
 * Constructs a new BOM InputStream that detects the specified BOMs and excludes a detected BOM from the data returned.
 *
 * @param delegate the InputStream to delegate to.
 * @param boms     The BOMs to detect and exclude.
 * @throws IOException if an error reading the first bytes of the stream occurs.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
 */
@Deprecated
public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) throws IOException {
    // include = false: the detected BOM is skipped.
    this(delegate, false, boms);
}
311
312 /**
313 * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
314 *
315 * @return The matched BOM or null if none matched.
316 */
317 private ByteOrderMark find() {
318 return bomList.stream().filter(this::matches).findFirst().orElse(null);
319 }
320
321 /**
322 * Gets the ByteOrderMark (Byte Order Mark).
323 *
324 * @return The BOM or null if none matched.
325 */
326 public ByteOrderMark getBOM() {
327 return byteOrderMark;
328 }
329
330 /**
331 * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
332 *
333 * @return The BOM charset Name or null if no BOM found.
334 * @throws IOException if an error reading the first bytes of the stream occurs.
335 */
336 public String getBOMCharsetName() throws IOException {
337 return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
338 }
339
340 /**
341 * Tests whether the stream contains one of the specified BOMs.
342 *
343 * @return true if the stream has one of the specified BOMs, otherwise false if it does not.
344 * @throws IOException if an error reading the first bytes of the stream occurs.
345 */
346 public boolean hasBOM() throws IOException {
347 return getBOM() != null;
348 }
349
350 /**
351 * Tests whether the stream contains the specified BOM.
352 *
353 * @param bom The BOM to check for.
354 * @return true if the stream has the specified BOM, otherwise false if it does not.
355 * @throws IllegalArgumentException if the BOM is not one the stream is configured to detect.
356 * @throws IOException if an error reading the first bytes of the stream occurs.
357 */
358 public boolean hasBOM(final ByteOrderMark bom) throws IOException {
359 if (!bomList.contains(bom)) {
360 throw new IllegalArgumentException("Stream not configured to detect " + bom);
361 }
362 return Objects.equals(getBOM(), bom);
363 }
364
365 /**
366 * Invokes the delegate's {@link InputStream#mark(int)} method.
367 *
368 * @param readLimit read ahead limit.
369 */
370 @Override
371 public synchronized void mark(final int readLimit) {
372 markFbIndex = fbIndex;
373 markedAtStart = firstBytes == null;
374 in.mark(readLimit);
375 }
376
377 /**
378 * Checks if the bytes match a BOM.
379 *
380 * @param bom The BOM.
381 * @return true if the bytes match the BOM, otherwise false.
382 */
383 private boolean matches(final ByteOrderMark bom) {
384 return bom.matches(firstBytes);
385 }
386
387 /**
388 * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
389 *
390 * @return the byte read (excluding BOM) or -1 if the end of stream.
391 * @throws IOException if an I/O error occurs.
392 */
393 @Override
394 public int read() throws IOException {
395 checkOpen();
396 final int b = readFirstBytes();
397 return b >= 0 ? b : in.read();
398 }
399
400 /**
401 * Invokes the delegate's {@link InputStream#read(byte[])} method, detecting and optionally skipping BOM.
402 *
403 * @param buf the buffer to read the bytes into, never {@code null}
404 * @return the number of bytes read (excluding BOM) or -1 if the end of stream.
405 * @throws NullPointerException if the buffer is {@code null}
406 * @throws IOException if an I/O error occurs.
407 */
408 @Override
409 public int read(final byte[] buf) throws IOException {
410 return read(buf, 0, buf.length);
411 }
412
413 /**
414 * Invokes the delegate's {@link InputStream#read(byte[], int, int)} method, detecting and optionally skipping BOM.
415 *
416 * @param buf the buffer to read the bytes into.
417 * @param off The start offset.
418 * @param len The number of bytes to read (excluding BOM).
419 * @return the number of bytes read or -1 if the end of stream.
420 * @throws NullPointerException if the buffer is {@code null}.
421 * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code buf.length}.
422 * @throws IOException if an I/O error occurs.
423 */
424 @Override
425 public int read(final byte[] buf, int off, int len) throws IOException {
426 IOUtils.checkFromIndexSize(buf, off, len);
427 if (len == 0) {
428 return 0;
429 }
430 int firstCount = 0;
431 int b = 0;
432 while (len > 0 && b >= 0) {
433 b = readFirstBytes();
434 if (b >= 0) {
435 buf[off++] = (byte) (b & 0xFF);
436 len--;
437 firstCount++;
438 }
439 }
440 final int secondCount = in.read(buf, off, len);
441 afterRead(secondCount);
442 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
443 }
444
445 /**
446 * Reads the byte order mark.
447 *
448 * @return the byte order mark.
449 * @throws IOException if an error reading the first bytes of the stream occurs.
450 */
451 private ByteOrderMark readBom() throws IOException {
452 int fbLength = 0;
453 // BOMs are sorted from longest to shortest
454 final int maxBomSize = bomList.get(0).length();
455 final int[] tmp = new int[maxBomSize];
456 // Read first maxBomSize bytes
457 for (int i = 0; i < tmp.length; i++) {
458 tmp[i] = in.read();
459 afterRead(tmp[i]);
460 fbLength++;
461 if (tmp[i] < 0) {
462 break;
463 }
464 }
465 firstBytes = Arrays.copyOf(tmp, fbLength);
466 // match BOM in firstBytes
467 final ByteOrderMark bom = find();
468 if (bom != null && !include) {
469 if (bom.length() < firstBytes.length) {
470 fbIndex = bom.length();
471 } else {
472 firstBytes = new int[0];
473 }
474 }
475 return bom;
476 }
477
478 /**
479 * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
480 * valid byte or -1 to indicate that the initial bytes have been processed already.
481 *
482 * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
483 * @throws IOException if an I/O error occurs.
484 */
485 private int readFirstBytes() throws IOException {
486 return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
487 }
488
489 /**
490 * Invokes the delegate's {@link InputStream#reset()} method.
491 *
492 * @throws IOException if an I/O error occurs.
493 */
494 @Override
495 public synchronized void reset() throws IOException {
496 fbIndex = markFbIndex;
497 if (markedAtStart) {
498 firstBytes = null;
499 }
500 in.reset();
501 }
502
503 /**
504 * Invokes the delegate's {@link InputStream#skip(long)} method, detecting and optionally skipping BOM.
505 *
506 * @param n the number of bytes to skip.
507 * @return the number of bytes to skipped or -1 if the end of stream.
508 * @throws IOException if an I/O error occurs.
509 */
510 @Override
511 public long skip(final long n) throws IOException {
512 int skipped = 0;
513 while (n > skipped && readFirstBytes() >= 0) {
514 skipped++;
515 }
516 return in.skip(n - skipped) + skipped;
517 }
518 }