View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  import java.util.Comparator;
25  import java.util.List;
26  import java.util.Objects;
27  
28  import org.apache.commons.io.ByteOrderMark;
29  import org.apache.commons.io.IOUtils;
30  import org.apache.commons.io.build.AbstractStreamBuilder;
31  
32  /**
33   * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
34   * <p>
35   * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
36   * first byte in the stream.
37   * </p>
38   * <p>
39   * The {@link ByteOrderMark} implementation has the following predefined BOMs:
40   * </p>
41   * <ul>
42   * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
43   * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
44   * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
45   * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
46   * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
47   * </ul>
48   * <p>
49   * To build an instance, use {@link Builder}.
50   * </p>
51   * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
52   *
53   * <pre>
54   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
55   * if (bomIn.hasBOM()) {
56   *     // has a UTF-8 BOM
57   * }
58   * </pre>
59   *
60   * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
61   *
62   * <pre>
63   * boolean include = true;
64   * BOMInputStream bomIn = BOMInputStream.builder()
65   *     .setInputStream(in)
66   *     .setInclude(include)
67   *     .get();
68   * if (bomIn.hasBOM()) {
69   *     // has a UTF-8 BOM
70   * }
71   * </pre>
72   *
73   * <h2>Example 3 - Detecting Multiple BOMs</h2>
74   *
75   * <pre>
76   * BOMInputStream bomIn = BOMInputStream.builder()
77   *   .setInputStream(in)
78   *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
79   *   .get();
80   * if (bomIn.hasBOM() == false) {
81   *     // No BOM found
82   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
83   *     // has a UTF-16LE BOM
84   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
85   *     // has a UTF-16BE BOM
86   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
87   *     // has a UTF-32LE BOM
88   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
89   *     // has a UTF-32BE BOM
90   * }
91   * </pre>
92   * <p>
93   * To build an instance, use {@link Builder}.
94   * </p>
95   *
96   * @see Builder
97   * @see org.apache.commons.io.ByteOrderMark
98   * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
99   * @since 2.0
100  */
101 public class BOMInputStream extends ProxyInputStream {
102 
103     // @formatter:off
104     /**
105      * Builds a new {@link BOMInputStream}.
106      *
107      * <h2>Using NIO</h2>
108      * <pre>{@code
109      * BOMInputStream s = BOMInputStream.builder()
110      *   .setPath(Paths.get("MyFile.xml"))
111      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
112      *   .setInclude(false)
113      *   .get();}
114      * </pre>
115      * <h2>Using IO</h2>
116      * <pre>{@code
117      * BOMInputStream s = BOMInputStream.builder()
118      *   .setFile(new File("MyFile.xml"))
119      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
120      *   .setInclude(false)
121      *   .get();}
122      * </pre>
123      *
124      * @see #get()
125      * @since 2.12.0
126      */
127     // @formatter:on
128     public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {
129 
130         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
131 
132         /**
133          * For test access.
134          *
135          * @return the default byte order mark
136          */
137         static ByteOrderMark getDefaultByteOrderMark() {
138             return DEFAULT[0];
139         }
140 
141         private ByteOrderMark[] byteOrderMarks = DEFAULT;
142 
143         private boolean include;
144 
145         /**
146          * Builds a new {@link BOMInputStream}.
147          * <p>
148          * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
149          * </p>
150          * <p>
151          * This builder use the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
152          * </p>
153          * <p>
154          * This builder use the following aspects:
155          * </p>
156          * <ul>
157          * <li>{@link #getInputStream()}</li>
158          * <li>include}</li>
159          * <li>byteOrderMarks</li>
160          * </ul>
161          *
162          * @return a new instance.
163          * @throws IllegalStateException         if the {@code origin} is {@code null}.
164          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
165          * @throws IOException                   if an I/O error occurs.
166          * @see #getInputStream()
167          */
168         @SuppressWarnings("resource")
169         @Override
170         public BOMInputStream get() throws IOException {
171             return new BOMInputStream(getInputStream(), include, byteOrderMarks);
172         }
173 
174         /**
175          * Sets the ByteOrderMarks to detect and optionally exclude.
176          * <p>
177          * The default is {@link ByteOrderMark#UTF_8}.
178          * </p>
179          *
180          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
181          * @return this
182          */
183         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
184             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
185             return this;
186         }
187 
188         /**
189          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
190          * <p>
191          * The default is false.
192          * </p>
193          *
194          * @param include true to include the UTF-8 BOM or false to exclude it. return this;
195          * @return this
196          */
197         public Builder setInclude(final boolean include) {
198             this.include = include;
199             return this;
200         }
201 
202     }
203 
204     /**
205      * Compares ByteOrderMark objects in descending length order.
206      */
207     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
208 
209     /**
210      * Constructs a new {@link Builder}.
211      *
212      * @return a new {@link Builder}.
213      * @since 2.12.0
214      */
215     public static Builder builder() {
216         return new Builder();
217     }
218 
219     /**
220      * BOMs are sorted from longest to shortest.
221      */
222     private final List<ByteOrderMark> boms;
223 
224     private ByteOrderMark byteOrderMark;
225     private int fbIndex;
226     private int fbLength;
227     private int[] firstBytes;
228     private final boolean include;
229     private boolean markedAtStart;
230     private int markFbIndex;
231 
232     /**
233      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
234      *
235      * @param delegate
236      *            the InputStream to delegate to
237      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
238      */
239     @Deprecated
240     public BOMInputStream(final InputStream delegate) {
241         this(delegate, false, Builder.DEFAULT);
242     }
243 
244     /**
245      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
246      *
247      * @param delegate
248      *            the InputStream to delegate to
249      * @param include
250      *            true to include the UTF-8 BOM or false to exclude it
251      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
252      */
253     @Deprecated
254     public BOMInputStream(final InputStream delegate, final boolean include) {
255         this(delegate, include, Builder.DEFAULT);
256     }
257 
258     /**
259      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
260      *
261      * @param delegate
262      *            the InputStream to delegate to
263      * @param include
264      *            true to include the specified BOMs or false to exclude them
265      * @param boms
266      *            The BOMs to detect and optionally exclude
267      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
268      */
269     @Deprecated
270     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
271         super(delegate);
272         if (IOUtils.length(boms) == 0) {
273             throw new IllegalArgumentException("No BOMs specified");
274         }
275         this.include = include;
276         final List<ByteOrderMark> list = Arrays.asList(boms);
277         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
278         list.sort(ByteOrderMarkLengthComparator);
279         this.boms = list;
280     }
281 
282     /**
283      * Constructs a new BOM InputStream that excludes the specified BOMs.
284      *
285      * @param delegate
286      *            the InputStream to delegate to
287      * @param boms
288      *            The BOMs to detect and exclude
289      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
290      */
291     @Deprecated
292     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
293         this(delegate, false, boms);
294     }
295 
296     /**
297      * Find a BOM with the specified bytes.
298      *
299      * @return The matched BOM or null if none matched
300      */
301     private ByteOrderMark find() {
302         return boms.stream().filter(this::matches).findFirst().orElse(null);
303     }
304 
305     /**
306      * Gets the BOM (Byte Order Mark).
307      *
308      * @return The BOM or null if none
309      * @throws IOException
310      *             if an error reading the first bytes of the stream occurs
311      */
312     public ByteOrderMark getBOM() throws IOException {
313         if (firstBytes == null) {
314             fbLength = 0;
315             // BOMs are sorted from longest to shortest
316             final int maxBomSize = boms.get(0).length();
317             firstBytes = new int[maxBomSize];
318             // Read first maxBomSize bytes
319             for (int i = 0; i < firstBytes.length; i++) {
320                 firstBytes[i] = in.read();
321                 fbLength++;
322                 if (firstBytes[i] < 0) {
323                     break;
324                 }
325             }
326             // match BOM in firstBytes
327             byteOrderMark = find();
328             if (byteOrderMark != null && !include) {
329                 if (byteOrderMark.length() < firstBytes.length) {
330                     fbIndex = byteOrderMark.length();
331                 } else {
332                     fbLength = 0;
333                 }
334             }
335         }
336         return byteOrderMark;
337     }
338 
339     /**
340      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
341      *
342      * @return The BOM charset Name or null if no BOM found
343      * @throws IOException
344      *             if an error reading the first bytes of the stream occurs
345      */
346     public String getBOMCharsetName() throws IOException {
347         getBOM();
348         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
349     }
350 
351     /**
352      * Tests whether the stream contains one of the specified BOMs.
353      *
354      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
355      * @throws IOException
356      *             if an error reading the first bytes of the stream occurs
357      */
358     public boolean hasBOM() throws IOException {
359         return getBOM() != null;
360     }
361 
362     /**
363      * Tests whether the stream contains the specified BOM.
364      *
365      * @param bom
366      *            The BOM to check for
367      * @return true if the stream has the specified BOM, otherwise false if it does not
368      * @throws IllegalArgumentException
369      *             if the BOM is not one the stream is configured to detect
370      * @throws IOException
371      *             if an error reading the first bytes of the stream occurs
372      */
373     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
374         if (!boms.contains(bom)) {
375             throw new IllegalArgumentException("Stream not configured to detect " + bom);
376         }
377         return Objects.equals(getBOM(), bom);
378     }
379 
380     /**
381      * Invokes the delegate's {@code mark(int)} method.
382      *
383      * @param readLimit
384      *            read ahead limit
385      */
386     @Override
387     public synchronized void mark(final int readLimit) {
388         markFbIndex = fbIndex;
389         markedAtStart = firstBytes == null;
390         in.mark(readLimit);
391     }
392 
393     /**
394      * Checks if the bytes match a BOM.
395      *
396      * @param bom
397      *            The BOM
398      * @return true if the bytes match the bom, otherwise false
399      */
400     private boolean matches(final ByteOrderMark bom) {
401         // if (bom.length() != fbLength) {
402         // return false;
403         // }
404         // firstBytes may be bigger than the BOM bytes
405         for (int i = 0; i < bom.length(); i++) {
406             if (bom.get(i) != firstBytes[i]) {
407                 return false;
408             }
409         }
410         return true;
411     }
412 
413     /**
414      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
415      *
416      * @return the byte read (excluding BOM) or -1 if the end of stream
417      * @throws IOException
418      *             if an I/O error occurs
419      */
420     @Override
421     public int read() throws IOException {
422         final int b = readFirstBytes();
423         return b >= 0 ? b : in.read();
424     }
425 
426     /**
427      * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
428      *
429      * @param buf
430      *            the buffer to read the bytes into
431      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
432      * @throws IOException
433      *             if an I/O error occurs
434      */
435     @Override
436     public int read(final byte[] buf) throws IOException {
437         return read(buf, 0, buf.length);
438     }
439 
440     /**
441      * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
442      *
443      * @param buf
444      *            the buffer to read the bytes into
445      * @param off
446      *            The start offset
447      * @param len
448      *            The number of bytes to read (excluding BOM)
449      * @return the number of bytes read or -1 if the end of stream
450      * @throws IOException
451      *             if an I/O error occurs
452      */
453     @Override
454     public int read(final byte[] buf, int off, int len) throws IOException {
455         int firstCount = 0;
456         int b = 0;
457         while (len > 0 && b >= 0) {
458             b = readFirstBytes();
459             if (b >= 0) {
460                 buf[off++] = (byte) (b & 0xFF);
461                 len--;
462                 firstCount++;
463             }
464         }
465         final int secondCount = in.read(buf, off, len);
466         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
467     }
468 
469     /**
470      * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
471      * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
472      * processed already.
473      *
474      * @return the byte read (excluding BOM) or -1 if the end of stream
475      * @throws IOException
476      *             if an I/O error occurs
477      */
478     private int readFirstBytes() throws IOException {
479         getBOM();
480         return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
481     }
482 
483     /**
484      * Invokes the delegate's {@code reset()} method.
485      *
486      * @throws IOException
487      *             if an I/O error occurs
488      */
489     @Override
490     public synchronized void reset() throws IOException {
491         fbIndex = markFbIndex;
492         if (markedAtStart) {
493             firstBytes = null;
494         }
495 
496         in.reset();
497     }
498 
499     /**
500      * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
501      *
502      * @param n
503      *            the number of bytes to skip
504      * @return the number of bytes to skipped or -1 if the end of stream
505      * @throws IOException
506      *             if an I/O error occurs
507      */
508     @Override
509     public long skip(final long n) throws IOException {
510         int skipped = 0;
511         while (n > skipped && readFirstBytes() >= 0) {
512             skipped++;
513         }
514         return in.skip(n - skipped) + skipped;
515     }
516 }