View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  import java.util.Comparator;
25  import java.util.List;
26  import java.util.Objects;
27  
28  import org.apache.commons.io.ByteOrderMark;
29  import org.apache.commons.io.IOUtils;
30  
31  /**
32   * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
33   * <p>
34   * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
35   * first byte in the stream.
36   * </p>
37   * <p>
38   * The {@link ByteOrderMark} implementation has the following predefined BOMs:
39   * </p>
40   * <ul>
41   * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
46   * </ul>
47   * <p>
48   * To build an instance, use {@link Builder}.
49   * </p>
50   * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
51   *
52   * <pre>
53   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
54   * if (bomIn.hasBOM()) {
55   *     // has a UTF-8 BOM
56   * }
57   * </pre>
58   *
59   * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
60   *
61   * <pre>
62   * boolean include = true;
63   * BOMInputStream bomIn = BOMInputStream.builder()
64   *     .setInputStream(in)
65   *     .setInclude(include)
66   *     .get();
67   * if (bomIn.hasBOM()) {
68   *     // has a UTF-8 BOM
69   * }
70   * </pre>
71   *
72   * <h2>Example 3 - Detecting Multiple BOMs</h2>
73   *
74   * <pre>
75   * BOMInputStream bomIn = BOMInputStream.builder()
76   *   .setInputStream(in)
77   *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
78   *   .get();
79   * if (bomIn.hasBOM() == false) {
80   *     // No BOM found
81   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
82   *     // has a UTF-16LE BOM
83   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
84   *     // has a UTF-16BE BOM
85   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
86   *     // has a UTF-32LE BOM
87   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
88   *     // has a UTF-32BE BOM
89   * }
90   * </pre>
91   * <p>
92   * To build an instance, use {@link Builder}.
93   * </p>
94   * <p>
95   * This class is not thread-safe.
96   * </p>
97   *
98   * @see Builder
99   * @see org.apache.commons.io.ByteOrderMark
100  * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
101  * @since 2.0
102  */
103 public class BOMInputStream extends ProxyInputStream {
104 
105     // @formatter:off
106     /**
107      * Builds a new {@link BOMInputStream}.
108      *
109      * <h2>Using NIO</h2>
110      * <pre>{@code
111      * BOMInputStream s = BOMInputStream.builder()
112      *   .setPath(Paths.get("MyFile.xml"))
113      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
114      *   .setInclude(false)
115      *   .get();}
116      * </pre>
117      * <h2>Using IO</h2>
118      * <pre>{@code
119      * BOMInputStream s = BOMInputStream.builder()
120      *   .setFile(new File("MyFile.xml"))
121      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
122      *   .setInclude(false)
123      *   .get();}
124      * </pre>
125      *
126      * @see #get()
127      * @since 2.12.0
128      */
129     // @formatter:on
130     public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
131 
132         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
133 
134         /**
135          * For test access.
136          *
137          * @return the default byte order mark
138          */
139         static ByteOrderMark getDefaultByteOrderMark() {
140             return DEFAULT[0];
141         }
142 
143         private ByteOrderMark[] byteOrderMarks = DEFAULT;
144 
145         private boolean include;
146 
147         /**
148          * Constructs a new builder of {@link BOMInputStream}.
149          */
150         public Builder() {
151             // empty
152         }
153 
154         /**
155          * Builds a new {@link BOMInputStream}.
156          * <p>
157          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
158          * </p>
159          * <p>
160          * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
161          * </p>
162          * <p>
163          * This builder uses the following aspects:
164          * </p>
165          * <ul>
166          * <li>{@link #getInputStream()}</li>
167          * <li>include}</li>
168          * <li>byteOrderMarks</li>
169          * </ul>
170          *
171          * @return a new instance.
172          * @throws IllegalStateException         if the {@code origin} is {@code null}.
173          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
174          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
175          * @see #getInputStream()
176          * @see #getUnchecked()
177          */
178         @Override
179         public BOMInputStream get() throws IOException {
180             return new BOMInputStream(this);
181         }
182 
183         /**
184          * Sets the ByteOrderMarks to detect and optionally exclude.
185          * <p>
186          * The default is {@link ByteOrderMark#UTF_8}.
187          * </p>
188          *
189          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
190          * @return {@code this} instance.
191          */
192         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
193             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
194             return this;
195         }
196 
197         /**
198          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
199          * <p>
200          * The default is false.
201          * </p>
202          *
203          * @param include true to include the UTF-8 BOM or false to exclude it. return this;
204          * @return {@code this} instance.
205          */
206         public Builder setInclude(final boolean include) {
207             this.include = include;
208             return this;
209         }
210 
211     }
212 
213     /**
214      * Compares ByteOrderMark objects in descending length order.
215      */
216     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
217 
218     /**
219      * Constructs a new {@link Builder}.
220      *
221      * @return a new {@link Builder}.
222      * @since 2.12.0
223      */
224     public static Builder builder() {
225         return new Builder();
226     }
227 
    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> bomList;

    /** The BOM found by {@code readBom()}, or {@code null} if none matched (or detection has not run yet). */
    private ByteOrderMark byteOrderMark;
    /** Index into {@code firstBytes} of the next value {@code readFirstBytes()} will return. */
    private int fbIndex;
    /** Values read from the start of the stream during BOM detection; {@code null} until detection has run. */
    private int[] firstBytes;
    /** When {@code true}, a detected BOM is left in the bytes returned to the caller. */
    private final boolean include;
    /** Set by {@code mark(int)}: {@code true} if the mark was placed before the first bytes were read. */
    private boolean markedAtStart;
    /** Value of {@code fbIndex} saved by {@code mark(int)} and restored by {@code reset()}. */
    private int markFbIndex;
239 
240     private BOMInputStream(final Builder builder) throws IOException {
241         super(builder);
242         if (IOUtils.length(builder.byteOrderMarks) == 0) {
243             throw new IllegalArgumentException("No ByteOrderMark specified.");
244         }
245         this.include = builder.include;
246         final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
247         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
248         list.sort(ByteOrderMarkLengthComparator);
249         this.bomList = list;
250     }
251 
252     /**
253      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
254      *
255      * @param delegate
256      *            the InputStream to delegate to
257      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
258      */
259     @Deprecated
260     public BOMInputStream(final InputStream delegate) {
261         this(delegate, false, Builder.DEFAULT);
262     }
263 
264     /**
265      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
266      *
267      * @param delegate
268      *            the InputStream to delegate to
269      * @param include
270      *            true to include the UTF-8 BOM or false to exclude it
271      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
272      */
273     @Deprecated
274     public BOMInputStream(final InputStream delegate, final boolean include) {
275         this(delegate, include, Builder.DEFAULT);
276     }
277 
278     /**
279      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
280      *
281      * @param delegate
282      *            the InputStream to delegate to
283      * @param include
284      *            true to include the specified BOMs or false to exclude them
285      * @param boms
286      *            The BOMs to detect and optionally exclude
287      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
288      */
289     @Deprecated
290     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
291         super(delegate);
292         if (IOUtils.length(boms) == 0) {
293             throw new IllegalArgumentException("No BOMs specified");
294         }
295         this.include = include;
296         final List<ByteOrderMark> list = Arrays.asList(boms);
297         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
298         list.sort(ByteOrderMarkLengthComparator);
299         this.bomList = list;
300     }
301 
302     /**
303      * Constructs a new BOM InputStream that excludes the specified BOMs.
304      *
305      * @param delegate
306      *            the InputStream to delegate to
307      * @param boms
308      *            The BOMs to detect and exclude
309      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
310      */
311     @Deprecated
312     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
313         this(delegate, false, boms);
314     }
315 
316     /**
317      * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
318      *
319      * @return The matched BOM or null if none matched.
320      */
321     private ByteOrderMark find() {
322         return bomList.stream().filter(this::matches).findFirst().orElse(null);
323     }
324 
325     /**
326      * Gets the ByteOrderMark (Byte Order Mark).
327      *
328      * @return The BOM or null if none matched.
329      * @throws IOException
330      *             if an error reading the first bytes of the stream occurs.
331      */
332     public ByteOrderMark getBOM() throws IOException {
333         if (firstBytes == null) {
334             byteOrderMark = readBom();
335         }
336         return byteOrderMark;
337     }
338 
339     /**
340      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
341      *
342      * @return The BOM charset Name or null if no BOM found
343      * @throws IOException
344      *             if an error reading the first bytes of the stream occurs
345      */
346     public String getBOMCharsetName() throws IOException {
347         getBOM();
348         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
349     }
350 
351     /**
352      * Tests whether the stream contains one of the specified BOMs.
353      *
354      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
355      * @throws IOException
356      *             if an error reading the first bytes of the stream occurs
357      */
358     public boolean hasBOM() throws IOException {
359         return getBOM() != null;
360     }
361 
362     /**
363      * Tests whether the stream contains the specified BOM.
364      *
365      * @param bom
366      *            The BOM to check for
367      * @return true if the stream has the specified BOM, otherwise false if it does not
368      * @throws IllegalArgumentException
369      *             if the BOM is not one the stream is configured to detect
370      * @throws IOException
371      *             if an error reading the first bytes of the stream occurs
372      */
373     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
374         if (!bomList.contains(bom)) {
375             throw new IllegalArgumentException("Stream not configured to detect " + bom);
376         }
377         return Objects.equals(getBOM(), bom);
378     }
379 
380     /**
381      * Invokes the delegate's {@code mark(int)} method.
382      *
383      * @param readLimit
384      *            read ahead limit
385      */
386     @Override
387     public synchronized void mark(final int readLimit) {
388         markFbIndex = fbIndex;
389         markedAtStart = firstBytes == null;
390         in.mark(readLimit);
391     }
392 
393     /**
394      * Checks if the bytes match a BOM.
395      *
396      * @param bom
397      *            The BOM
398      * @return true if the bytes match the bom, otherwise false
399      */
400     private boolean matches(final ByteOrderMark bom) {
401         return bom.matches(firstBytes);
402     }
403 
404     /**
405      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
406      *
407      * @return the byte read (excluding BOM) or -1 if the end of stream
408      * @throws IOException
409      *             if an I/O error occurs
410      */
411     @Override
412     public int read() throws IOException {
413         checkOpen();
414         final int b = readFirstBytes();
415         return b >= 0 ? b : in.read();
416     }
417 
418     /**
419      * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
420      *
421      * @param buf
422      *            the buffer to read the bytes into
423      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
424      * @throws IOException
425      *             if an I/O error occurs
426      */
427     @Override
428     public int read(final byte[] buf) throws IOException {
429         return read(buf, 0, buf.length);
430     }
431 
432     /**
433      * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
434      *
435      * @param buf
436      *            the buffer to read the bytes into
437      * @param off
438      *            The start offset
439      * @param len
440      *            The number of bytes to read (excluding BOM)
441      * @return the number of bytes read or -1 if the end of stream
442      * @throws IOException
443      *             if an I/O error occurs
444      */
445     @Override
446     public int read(final byte[] buf, int off, int len) throws IOException {
447         int firstCount = 0;
448         int b = 0;
449         while (len > 0 && b >= 0) {
450             b = readFirstBytes();
451             if (b >= 0) {
452                 buf[off++] = (byte) (b & 0xFF);
453                 len--;
454                 firstCount++;
455             }
456         }
457         final int secondCount = in.read(buf, off, len);
458         afterRead(secondCount);
459         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
460     }
461 
462     private ByteOrderMark readBom() throws IOException {
463         int fbLength = 0;
464         // BOMs are sorted from longest to shortest
465         final int maxBomSize = bomList.get(0).length();
466         final int[] tmp = new int[maxBomSize];
467         // Read first maxBomSize bytes
468         for (int i = 0; i < tmp.length; i++) {
469             tmp[i] = in.read();
470             afterRead(tmp[i]);
471             fbLength++;
472             if (tmp[i] < 0) {
473                 break;
474             }
475         }
476         firstBytes = Arrays.copyOf(tmp, fbLength);
477         // match BOM in firstBytes
478         final ByteOrderMark bom = find();
479         if (bom != null && !include) {
480             if (bom.length() < firstBytes.length) {
481                 fbIndex = bom.length();
482             } else {
483                 firstBytes = new int[0];
484             }
485         }
486         return bom;
487     }
488 
489     /**
490      * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
491      * valid byte or -1 to indicate that the initial bytes have been processed already.
492      *
493      * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
494      * @throws IOException if an I/O error occurs
495      */
496     private int readFirstBytes() throws IOException {
497         getBOM();
498         return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
499     }
500 
501     /**
502      * Invokes the delegate's {@code reset()} method.
503      *
504      * @throws IOException
505      *             if an I/O error occurs
506      */
507     @Override
508     public synchronized void reset() throws IOException {
509         fbIndex = markFbIndex;
510         if (markedAtStart) {
511             firstBytes = null;
512         }
513         in.reset();
514     }
515 
516     /**
517      * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
518      *
519      * @param n
520      *            the number of bytes to skip
521      * @return the number of bytes to skipped or -1 if the end of stream
522      * @throws IOException
523      *             if an I/O error occurs
524      */
525     @Override
526     public long skip(final long n) throws IOException {
527         int skipped = 0;
528         while (n > skipped && readFirstBytes() >= 0) {
529             skipped++;
530         }
531         return in.skip(n - skipped) + skipped;
532     }
533 }