View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  import java.util.Comparator;
25  import java.util.List;
26  import java.util.Objects;
27  
28  import org.apache.commons.io.ByteOrderMark;
29  import org.apache.commons.io.IOUtils;
30  
31  /**
32   * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
33   * <p>
34   * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
35   * first byte in the stream.
36   * </p>
37   * <p>
38   * The {@link ByteOrderMark} implementation has the following predefined BOMs:
39   * </p>
40   * <ul>
41   * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
46   * </ul>
47   * <p>
48   * To build an instance, use {@link Builder}.
49   * </p>
50   * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
51   *
52   * <pre>
53   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
54   * if (bomIn.hasBOM()) {
55   *     // has a UTF-8 BOM
56   * }
57   * </pre>
58   *
59   * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
60   *
61   * <pre>
62   * boolean include = true;
63   * BOMInputStream bomIn = BOMInputStream.builder()
64   *     .setInputStream(in)
65   *     .setInclude(include)
66   *     .get();
67   * if (bomIn.hasBOM()) {
68   *     // has a UTF-8 BOM
69   * }
70   * </pre>
71   *
72   * <h2>Example 3 - Detecting Multiple BOMs</h2>
73   *
74   * <pre>
75   * BOMInputStream bomIn = BOMInputStream.builder()
76   *   .setInputStream(in)
77   *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
78   *   .get();
79   * if (bomIn.hasBOM() == false) {
80   *     // No BOM found
81   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
82   *     // has a UTF-16LE BOM
83   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
84   *     // has a UTF-16BE BOM
85   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
86   *     // has a UTF-32LE BOM
87   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
88   *     // has a UTF-32BE BOM
89   * }
90   * </pre>
91   * <p>
92   * To build an instance, use {@link Builder}.
93   * </p>
94   *
95   * @see Builder
96   * @see org.apache.commons.io.ByteOrderMark
97   * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
98   * @since 2.0
99   */
100 public class BOMInputStream extends ProxyInputStream {
101 
102     // @formatter:off
103     /**
104      * Builds a new {@link BOMInputStream}.
105      *
106      * <h2>Using NIO</h2>
107      * <pre>{@code
108      * BOMInputStream s = BOMInputStream.builder()
109      *   .setPath(Paths.get("MyFile.xml"))
110      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
111      *   .setInclude(false)
112      *   .get();}
113      * </pre>
114      * <h2>Using IO</h2>
115      * <pre>{@code
116      * BOMInputStream s = BOMInputStream.builder()
117      *   .setFile(new File("MyFile.xml"))
118      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
119      *   .setInclude(false)
120      *   .get();}
121      * </pre>
122      *
123      * @see #get()
124      * @since 2.12.0
125      */
126     // @formatter:on
127     public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
128 
129         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
130 
131         /**
132          * For test access.
133          *
134          * @return the default byte order mark
135          */
136         static ByteOrderMark getDefaultByteOrderMark() {
137             return DEFAULT[0];
138         }
139 
140         private ByteOrderMark[] byteOrderMarks = DEFAULT;
141 
142         private boolean include;
143 
144         /**
145          * Constructs a new builder of {@link BOMInputStream}.
146          */
147         public Builder() {
148             // empty
149         }
150 
151         /**
152          * Builds a new {@link BOMInputStream}.
153          * <p>
154          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
155          * </p>
156          * <p>
157          * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
158          * </p>
159          * <p>
160          * This builder uses the following aspects:
161          * </p>
162          * <ul>
163          * <li>{@link #getInputStream()}</li>
164          * <li>include}</li>
165          * <li>byteOrderMarks</li>
166          * </ul>
167          *
168          * @return a new instance.
169          * @throws IllegalStateException         if the {@code origin} is {@code null}.
170          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
171          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
172          * @see #getInputStream()
173          * @see #getUnchecked()
174          */
175         @Override
176         public BOMInputStream get() throws IOException {
177             return new BOMInputStream(this);
178         }
179 
180         /**
181          * Sets the ByteOrderMarks to detect and optionally exclude.
182          * <p>
183          * The default is {@link ByteOrderMark#UTF_8}.
184          * </p>
185          *
186          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
187          * @return {@code this} instance.
188          */
189         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
190             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
191             return this;
192         }
193 
194         /**
195          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
196          * <p>
197          * The default is false.
198          * </p>
199          *
200          * @param include true to include the UTF-8 BOM or false to exclude it. return this;
201          * @return {@code this} instance.
202          */
203         public Builder setInclude(final boolean include) {
204             this.include = include;
205             return this;
206         }
207 
208     }
209 
210     /**
211      * Compares ByteOrderMark objects in descending length order.
212      */
213     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
214 
215     /**
216      * Constructs a new {@link Builder}.
217      *
218      * @return a new {@link Builder}.
219      * @since 2.12.0
220      */
221     public static Builder builder() {
222         return new Builder();
223     }
224 
    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> bomList;

    /** The BOM found by the most recent {@code readBom()} call, or {@code null} if none matched or nothing was read yet. */
    private ByteOrderMark byteOrderMark;
    /** Index into {@link #firstBytes} of the next buffered value to hand out. */
    private int fbIndex;
    /** Number of leading entries of {@link #firstBytes} that may still be handed out; zero once a whole-buffer BOM is excluded. */
    private int fbLength;
    /** Values read ahead from the delegate for BOM detection; {@code null} until detection has run. */
    private int[] firstBytes;
    /** Whether a detected BOM is kept in the data returned to the reader. */
    private final boolean include;
    /** Whether {@code mark(int)} was called before detection ran, that is while {@link #firstBytes} was still {@code null}. */
    private boolean markedAtStart;
    /** The value of {@link #fbIndex} captured by {@code mark(int)} and restored by {@code reset()}. */
    private int markFbIndex;
237 
238     private BOMInputStream(final Builder builder) throws IOException {
239         super(builder);
240         if (IOUtils.length(builder.byteOrderMarks) == 0) {
241             throw new IllegalArgumentException("No ByteOrderMark specified.");
242         }
243         this.include = builder.include;
244         final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
245         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
246         list.sort(ByteOrderMarkLengthComparator);
247         this.bomList = list;
248     }
249 
    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     * <p>
     * Delegates to the three-argument constructor with {@code include} set to {@code false} and the builder's default
     * BOM array.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }
261 
    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     * <p>
     * Delegates to the three-argument constructor with the builder's default BOM array.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }
275 
276     /**
277      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
278      *
279      * @param delegate
280      *            the InputStream to delegate to
281      * @param include
282      *            true to include the specified BOMs or false to exclude them
283      * @param boms
284      *            The BOMs to detect and optionally exclude
285      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
286      */
287     @Deprecated
288     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
289         super(delegate);
290         if (IOUtils.length(boms) == 0) {
291             throw new IllegalArgumentException("No BOMs specified");
292         }
293         this.include = include;
294         final List<ByteOrderMark> list = Arrays.asList(boms);
295         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
296         list.sort(ByteOrderMarkLengthComparator);
297         this.bomList = list;
298     }
299 
    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     * <p>
     * Delegates to the three-argument constructor with {@code include} set to {@code false}.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }
313 
314     /**
315      * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
316      *
317      * @return The matched BOM or null if none matched.
318      */
319     private ByteOrderMark find() {
320         return bomList.stream().filter(this::matches).findFirst().orElse(null);
321     }
322 
323     /**
324      * Gets the ByteOrderMark (Byte Order Mark).
325      *
326      * @return The BOM or null if none matched.
327      * @throws IOException
328      *             if an error reading the first bytes of the stream occurs.
329      */
330     public ByteOrderMark getBOM() throws IOException {
331         if (firstBytes == null) {
332             byteOrderMark = readBom();
333         }
334         return byteOrderMark;
335     }
336 
337     /**
338      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
339      *
340      * @return The BOM charset Name or null if no BOM found
341      * @throws IOException
342      *             if an error reading the first bytes of the stream occurs
343      */
344     public String getBOMCharsetName() throws IOException {
345         getBOM();
346         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
347     }
348 
349     /**
350      * Tests whether the stream contains one of the specified BOMs.
351      *
352      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
353      * @throws IOException
354      *             if an error reading the first bytes of the stream occurs
355      */
356     public boolean hasBOM() throws IOException {
357         return getBOM() != null;
358     }
359 
360     /**
361      * Tests whether the stream contains the specified BOM.
362      *
363      * @param bom
364      *            The BOM to check for
365      * @return true if the stream has the specified BOM, otherwise false if it does not
366      * @throws IllegalArgumentException
367      *             if the BOM is not one the stream is configured to detect
368      * @throws IOException
369      *             if an error reading the first bytes of the stream occurs
370      */
371     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
372         if (!bomList.contains(bom)) {
373             throw new IllegalArgumentException("Stream not configured to detect " + bom);
374         }
375         return Objects.equals(getBOM(), bom);
376     }
377 
378     /**
379      * Invokes the delegate's {@code mark(int)} method.
380      *
381      * @param readLimit
382      *            read ahead limit
383      */
384     @Override
385     public synchronized void mark(final int readLimit) {
386         markFbIndex = fbIndex;
387         markedAtStart = firstBytes == null;
388         in.mark(readLimit);
389     }
390 
391     /**
392      * Checks if the bytes match a BOM.
393      *
394      * @param bom
395      *            The BOM
396      * @return true if the bytes match the bom, otherwise false
397      */
398     private boolean matches(final ByteOrderMark bom) {
399         return bom.matches(firstBytes);
400     }
401 
402     /**
403      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
404      *
405      * @return the byte read (excluding BOM) or -1 if the end of stream
406      * @throws IOException
407      *             if an I/O error occurs
408      */
409     @Override
410     public int read() throws IOException {
411         checkOpen();
412         final int b = readFirstBytes();
413         return b >= 0 ? b : in.read();
414     }
415 
416     /**
417      * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
418      *
419      * @param buf
420      *            the buffer to read the bytes into
421      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
422      * @throws IOException
423      *             if an I/O error occurs
424      */
425     @Override
426     public int read(final byte[] buf) throws IOException {
427         return read(buf, 0, buf.length);
428     }
429 
430     /**
431      * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
432      *
433      * @param buf
434      *            the buffer to read the bytes into
435      * @param off
436      *            The start offset
437      * @param len
438      *            The number of bytes to read (excluding BOM)
439      * @return the number of bytes read or -1 if the end of stream
440      * @throws IOException
441      *             if an I/O error occurs
442      */
443     @Override
444     public int read(final byte[] buf, int off, int len) throws IOException {
445         int firstCount = 0;
446         int b = 0;
447         while (len > 0 && b >= 0) {
448             b = readFirstBytes();
449             if (b >= 0) {
450                 buf[off++] = (byte) (b & 0xFF);
451                 len--;
452                 firstCount++;
453             }
454         }
455         final int secondCount = in.read(buf, off, len);
456         afterRead(secondCount);
457         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
458     }
459 
460     private ByteOrderMark readBom() throws IOException {
461         fbLength = 0;
462         // BOMs are sorted from longest to shortest
463         final int maxBomSize = bomList.get(0).length();
464         firstBytes = new int[maxBomSize];
465         // Read first maxBomSize bytes
466         for (int i = 0; i < firstBytes.length; i++) {
467             firstBytes[i] = in.read();
468             afterRead(firstBytes[i]);
469             fbLength++;
470             if (firstBytes[i] < 0) {
471                 break;
472             }
473         }
474         // match BOM in firstBytes
475         final ByteOrderMark bom = find();
476         if (bom != null && !include) {
477             if (bom.length() < firstBytes.length) {
478                 fbIndex = bom.length();
479             } else {
480                 fbLength = 0;
481             }
482         }
483         return bom;
484     }
485 
486     /**
487      * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
488      * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
489      * processed already.
490      *
491      * @return the byte read (excluding BOM) or -1 if the end of stream
492      * @throws IOException
493      *             if an I/O error occurs
494      */
495     private int readFirstBytes() throws IOException {
496         getBOM();
497         return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
498     }
499 
500     /**
501      * Invokes the delegate's {@code reset()} method.
502      *
503      * @throws IOException
504      *             if an I/O error occurs
505      */
506     @Override
507     public synchronized void reset() throws IOException {
508         fbIndex = markFbIndex;
509         if (markedAtStart) {
510             firstBytes = null;
511         }
512         in.reset();
513     }
514 
515     /**
516      * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
517      *
518      * @param n
519      *            the number of bytes to skip
520      * @return the number of bytes to skipped or -1 if the end of stream
521      * @throws IOException
522      *             if an I/O error occurs
523      */
524     @Override
525     public long skip(final long n) throws IOException {
526         int skipped = 0;
527         while (n > skipped && readFirstBytes() >= 0) {
528             skipped++;
529         }
530         return in.skip(n - skipped) + skipped;
531     }
532 }