View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.io.input;
19  
20  import static org.apache.commons.io.IOUtils.EOF;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.util.Arrays;
25  import java.util.Comparator;
26  import java.util.List;
27  import java.util.Objects;
28  
29  import org.apache.commons.io.ByteOrderMark;
30  import org.apache.commons.io.IOUtils;
31  
32  /**
33   * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
34   * <p>
35   * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the first byte in the stream.
36   * </p>
37   * <p>
38   * The {@link ByteOrderMark} implementation has the following predefined BOMs:
39   * </p>
40   * <ul>
41   * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
42   * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
43   * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
44   * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
45   * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
46   * </ul>
47   * <p>
48   * To build an instance, use {@link Builder}.
49   * </p>
50   * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
51   *
52   * <pre>
53   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
54   * if (bomIn.hasBOM()) {
55   *     // has a UTF-8 BOM
56   * }
57   * </pre>
58   *
59   * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
60   *
61   * <pre>
62   * boolean include = true;
63   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).setInclude(include).get();
64   * if (bomIn.hasBOM()) {
65   *     // has a UTF-8 BOM
66   * }
67   * </pre>
68   *
69   * <h2>Example 3 - Detecting Multiple BOMs</h2>
70   *
71   * <pre>
72   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in)
73   *         .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE).get();
74   * if (bomIn.hasBOM() == false) {
75   *     // No BOM found
76   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
77   *     // has a UTF-16LE BOM
78   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
79   *     // has a UTF-16BE BOM
80   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
81   *     // has a UTF-32LE BOM
82   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
83   *     // has a UTF-32BE BOM
84   * }
85   * </pre>
86   * <p>
87   * To build an instance, use {@link Builder}.
88   * </p>
89   * <p>
90   * This class is not thread-safe.
91   * </p>
92   *
93   * @see Builder
94   * @see org.apache.commons.io.ByteOrderMark
95   * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
96   * @since 2.0
97   */
98  public class BOMInputStream extends ProxyInputStream {
99  
100     // @formatter:off
101     /**
102      * Builds a new {@link BOMInputStream}.
103      *
104      * <h2>Using NIO</h2>
105      * <pre>{@code
106      * BOMInputStream s = BOMInputStream.builder()
107      *   .setPath(Paths.get("MyFile.xml"))
108      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
109      *   .setInclude(false)
110      *   .get();}
111      * </pre>
112      * <h2>Using IO</h2>
113      * <pre>{@code
114      * BOMInputStream s = BOMInputStream.builder()
115      *   .setFile(new File("MyFile.xml"))
116      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
117      *   .setInclude(false)
118      *   .get();}
119      * </pre>
120      *
121      * @see #get()
122      * @since 2.12.0
123      */
124     // @formatter:on
125     public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
126 
127         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
128 
129         /**
130          * For test access.
131          *
132          * @return the default byte order mark.
133          */
134         static ByteOrderMark getDefaultByteOrderMark() {
135             return DEFAULT[0];
136         }
137 
138         private ByteOrderMark[] byteOrderMarks = DEFAULT;
139         private boolean include;
140 
141         /**
142          * Constructs a new builder of {@link BOMInputStream}.
143          */
144         public Builder() {
145             // empty
146         }
147 
148         /**
149          * Builds a new {@link BOMInputStream}.
150          * <p>
151          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
152          * </p>
153          * <p>
154          * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
155          * </p>
156          * <p>
157          * This builder uses the following aspects:
158          * </p>
159          * <ul>
160          * <li>{@link #getInputStream()}</li>
161          * <li>include}</li>
162          * <li>byteOrderMarks</li>
163          * </ul>
164          *
165          * @return a new instance.
166          * @throws IllegalStateException         if the {@code origin} is {@code null}.
167          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
168          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
169          * @see #getInputStream()
170          * @see #getUnchecked()
171          */
172         @Override
173         public BOMInputStream get() throws IOException {
174             return new BOMInputStream(this);
175         }
176 
177         /**
178          * Sets the ByteOrderMarks to detect and optionally exclude.
179          * <p>
180          * The default is {@link ByteOrderMark#UTF_8}.
181          * </p>
182          *
183          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
184          * @return {@code this} instance.
185          */
186         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
187             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
188             return this;
189         }
190 
191         /**
192          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
193          * <p>
194          * The default is false.
195          * </p>
196          *
197          * @param include true to include the UTF-8 BOM or false to exclude it. return this;.
198          * @return {@code this} instance.
199          */
200         public Builder setInclude(final boolean include) {
201             this.include = include;
202             return this;
203         }
204     }
205 
206     /**
207      * Compares ByteOrderMark objects in descending length order.
208      */
209     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
210 
211     /**
212      * Constructs a new {@link Builder} for configuring and creating a {@link BOMInputStream}.
213      *
214      * @return a new {@link Builder}; every call creates a separate instance.
215      * @since 2.12.0
216      */
217     public static Builder builder() {
218         return new Builder();
219     }
220 
221     /**
222      * The BOMs this stream is configured to detect, sorted from longest to shortest.
223      */
224     private final List<ByteOrderMark> bomList;
        // The BOM returned by readBom() during construction; null when no configured BOM matched.
225     private final ByteOrderMark byteOrderMark;
        // Index into firstBytes of the next value readFirstBytes() hands out.
226     private int fbIndex;
        // The values read from the delegate by readBom(); replaced by an empty array when an excluded BOM accounts for all of them.
227     private int[] firstBytes;
        // When false, readBom() arranges for a matched BOM to be skipped; when true, the BOM stays in the data that is read.
228     private final boolean include;
        // Set by mark(int) to whether firstBytes was null at that moment; reset() nulls firstBytes when this is true.
229     private boolean markedAtStart;
        // The value of fbIndex captured by mark(int) and restored by reset().
230     private int markFbIndex;
231 
232     /**
233      * Constructs a new instance.
234      *
235      * @param builder The builder.
236      * @throws IOException if an error reading the first bytes of the stream occurs.
237      */
238     private BOMInputStream(final Builder builder) throws IOException {
239         super(builder);
240         if (IOUtils.length(builder.byteOrderMarks) == 0) {
241             throw new IllegalArgumentException("No ByteOrderMark specified.");
242         }
243         this.include = builder.include;
244         final List<ByteOrderMark> bomList = Arrays.asList(builder.byteOrderMarks);
245         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
246         bomList.sort(ByteOrderMarkLengthComparator);
247         this.bomList = bomList;
248         this.byteOrderMark = readBom();
249     }
250 
251     /**
252      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
         * <p>
         * Delegates to {@link #BOMInputStream(InputStream, boolean, ByteOrderMark...)} with {@code include} set to {@code false} and the builder's default
         * byte order marks.
         * </p>
253      *
254      * @param delegate the InputStream to delegate to.
255      * @throws IOException if an error reading the first bytes of the stream occurs.
256      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
257      */
258     @Deprecated
259     public BOMInputStream(final InputStream delegate) throws IOException {
260         this(delegate, false, Builder.DEFAULT);
261     }
262 
263     /**
264      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
         * <p>
         * Delegates to {@link #BOMInputStream(InputStream, boolean, ByteOrderMark...)} with the builder's default byte order marks.
         * </p>
265      *
266      * @param delegate the InputStream to delegate to.
267      * @param include  true to include the UTF-8 BOM or false to exclude it.
268      * @throws IOException if an error reading the first bytes of the stream occurs.
269      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
270      */
271     @Deprecated
272     public BOMInputStream(final InputStream delegate, final boolean include) throws IOException {
273         this(delegate, include, Builder.DEFAULT);
274     }
275 
276     /**
277      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
278      *
279      * @param delegate the InputStream to delegate to.
280      * @param include  true to include the specified BOMs or false to exclude them.
281      * @param boms     The BOMs to detect and optionally exclude.
282      * @throws IOException if an error reading the first bytes of the stream occurs.
283      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
284      */
285     @Deprecated
286     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) throws IOException {
287         super(delegate);
288         if (IOUtils.length(boms) == 0) {
289             throw new IllegalArgumentException("No BOMs specified");
290         }
291         this.include = include;
292         final List<ByteOrderMark> list = Arrays.asList(boms);
293         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
294         list.sort(ByteOrderMarkLengthComparator);
295         this.bomList = list;
296         this.byteOrderMark = readBom();
297     }
298 
299     /**
300      * Constructs a new BOM InputStream that excludes the specified BOMs.
         * <p>
         * Delegates to {@link #BOMInputStream(InputStream, boolean, ByteOrderMark...)} with {@code include} set to {@code false}.
         * </p>
301      *
302      * @param delegate the InputStream to delegate to.
303      * @param boms     The BOMs to detect and exclude.
304      * @throws IOException if an error reading the first bytes of the stream occurs.
305      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
306      */
307     @Deprecated
308     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) throws IOException {
309         this(delegate, false, boms);
310     }
311 
312     /**
313      * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
314      *
315      * @return The matched BOM or null if none matched.
316      */
317     private ByteOrderMark find() {
318         return bomList.stream().filter(this::matches).findFirst().orElse(null);
319     }
320 
321     /**
322      * Gets the ByteOrderMark (Byte Order Mark) that was detected when this stream was constructed.
323      *
324      * @return The BOM or null if none matched.
325      */
326     public ByteOrderMark getBOM() {
327         return byteOrderMark;
328     }
329 
330     /**
331      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
332      *
333      * @return The BOM charset Name or null if no BOM found.
334      * @throws IOException if an error reading the first bytes of the stream occurs.
335      */
336     public String getBOMCharsetName() throws IOException {
337         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
338     }
339 
340     /**
341      * Tests whether the stream contains one of the specified BOMs, that is, whether {@link #getBOM()} returns a non-null value.
342      *
343      * @return true if the stream has one of the specified BOMs, otherwise false if it does not.
344      * @throws IOException if an error reading the first bytes of the stream occurs.
345      */
346     public boolean hasBOM() throws IOException {
347         return getBOM() != null;
348     }
349 
350     /**
351      * Tests whether the stream contains the specified BOM.
352      *
353      * @param bom The BOM to check for.
354      * @return true if the stream has the specified BOM, otherwise false if it does not.
355      * @throws IllegalArgumentException if the BOM is not one the stream is configured to detect.
356      * @throws IOException              if an error reading the first bytes of the stream occurs.
357      */
358     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
359         if (!bomList.contains(bom)) {
360             throw new IllegalArgumentException("Stream not configured to detect " + bom);
361         }
362         return Objects.equals(getBOM(), bom);
363     }
364 
365     /**
366      * Records the current position within the buffered first bytes, then invokes the delegate's {@link InputStream#mark(int)} method.
         * <p>
         * The recorded position is restored by {@link #reset()}.
         * </p>
367      *
368      * @param readLimit read ahead limit, passed to the delegate unchanged.
369      */
370     @Override
371     public synchronized void mark(final int readLimit) {
372         markFbIndex = fbIndex;
            // NOTE(review): both constructors assign firstBytes through readBom(), so this looks like it is always false after construction - confirm.
373         markedAtStart = firstBytes == null;
374         in.mark(readLimit);
375     }
376 
377     /**
378      * Checks if the buffered first bytes match a BOM by delegating to the BOM's own {@code matches} method.
379      *
380      * @param bom The BOM.
381      * @return true if the bytes match the BOM, otherwise false.
382      */
383     private boolean matches(final ByteOrderMark bom) {
384         return bom.matches(firstBytes);
385     }
386 
387     /**
388      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
389      *
390      * @return the byte read (excluding BOM) or -1 if the end of stream.
391      * @throws IOException if an I/O error occurs.
392      */
393     @Override
394     public int read() throws IOException {
395         checkOpen();
396         final int b = readFirstBytes();
397         return b >= 0 ? b : in.read();
398     }
399 
400     /**
401      * Reads bytes into the whole of the given buffer, detecting and optionally skipping BOM.
         * <p>
         * Equivalent to calling {@link #read(byte[], int, int)} with an offset of 0 and a length of {@code buf.length}.
         * </p>
402      *
403      * @param buf the buffer to read the bytes into, never {@code null}
404      * @return the number of bytes read (excluding BOM) or -1 if the end of stream.
405      * @throws NullPointerException if the buffer is {@code null}
406      * @throws IOException          if an I/O error occurs.
407      */
408     @Override
409     public int read(final byte[] buf) throws IOException {
410         return read(buf, 0, buf.length);
411     }
412 
413     /**
414      * Invokes the delegate's {@link InputStream#read(byte[], int, int)} method, detecting and optionally skipping BOM.
415      *
416      * @param buf the buffer to read the bytes into.
417      * @param off The start offset.
418      * @param len The number of bytes to read (excluding BOM).
419      * @return the number of bytes read or -1 if the end of stream.
420      * @throws NullPointerException      if the buffer is {@code null}.
421      * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code buf.length}.
422      * @throws IOException               if an I/O error occurs.
423      */
424     @Override
425     public int read(final byte[] buf, int off, int len) throws IOException {
426         IOUtils.checkFromIndexSize(buf, off, len);
427         if (len == 0) {
428             return 0;
429         }
430         int firstCount = 0;
431         int b = 0;
432         while (len > 0 && b >= 0) {
433             b = readFirstBytes();
434             if (b >= 0) {
435                 buf[off++] = (byte) (b & 0xFF);
436                 len--;
437                 firstCount++;
438             }
439         }
440         final int secondCount = in.read(buf, off, len);
441         afterRead(secondCount);
442         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
443     }
444 
445     /**
446      * Reads the byte order mark.
447      *
448      * @return the byte order mark.
449      * @throws IOException if an error reading the first bytes of the stream occurs.
450      */
451     private ByteOrderMark readBom() throws IOException {
452         int fbLength = 0;
453         // BOMs are sorted from longest to shortest
454         final int maxBomSize = bomList.get(0).length();
455         final int[] tmp = new int[maxBomSize];
456         // Read first maxBomSize bytes
457         for (int i = 0; i < tmp.length; i++) {
458             tmp[i] = in.read();
459             afterRead(tmp[i]);
460             fbLength++;
461             if (tmp[i] < 0) {
462                 break;
463             }
464         }
465         firstBytes = Arrays.copyOf(tmp, fbLength);
466         // match BOM in firstBytes
467         final ByteOrderMark bom = find();
468         if (bom != null && !include) {
469             if (bom.length() < firstBytes.length) {
470                 fbIndex = bom.length();
471             } else {
472                 firstBytes = new int[0];
473             }
474         }
475         return bom;
476     }
477 
478     /**
479      * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
480      * valid byte or -1 to indicate that the initial bytes have been processed already.
481      *
482      * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
483      * @throws IOException if an I/O error occurs.
484      */
485     private int readFirstBytes() throws IOException {
486         return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
487     }
488 
489     /**
490      * Restores the position within the buffered first bytes recorded by {@link #mark(int)}, then invokes the delegate's {@link InputStream#reset()} method.
491      *
492      * @throws IOException if an I/O error occurs.
493      */
494     @Override
495     public synchronized void reset() throws IOException {
496         fbIndex = markFbIndex;
497         if (markedAtStart) {
                // NOTE(review): readFirstBytes() dereferences firstBytes without a null check, so a read after this assignment would fail - confirm this
                // branch is unreachable.
498             firstBytes = null;
499         }
500         in.reset();
501     }
502 
503     /**
504      * Invokes the delegate's {@link InputStream#skip(long)} method, detecting and optionally skipping BOM.
505      *
506      * @param n the number of bytes to skip.
507      * @return the number of bytes to skipped or -1 if the end of stream.
508      * @throws IOException if an I/O error occurs.
509      */
510     @Override
511     public long skip(final long n) throws IOException {
512         int skipped = 0;
513         while (n > skipped && readFirstBytes() >= 0) {
514             skipped++;
515         }
516         return in.skip(n - skipped) + skipped;
517     }
518 }