View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  import java.util.Comparator;
25  import java.util.List;
26  import java.util.Objects;
27  
28  import org.apache.commons.io.ByteOrderMark;
29  import org.apache.commons.io.IOUtils;
30  import org.apache.commons.io.build.AbstractStreamBuilder;
31  
32  /**
33   * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
34   * <p>
35   * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
36   * first byte in the stream.
37   * </p>
38   * <p>
39   * The {@link ByteOrderMark} implementation has the following predefined BOMs:
40   * </p>
41   * <ul>
42   * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
43   * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
44   * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
45   * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
46   * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
47   * </ul>
48   * <p>
49   * To build an instance, see {@link Builder}.
50   * </p>
51   * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
52   *
53   * <pre>
54   * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
55   * if (bomIn.hasBOM()) {
56   *     // has a UTF-8 BOM
57   * }
58   * </pre>
59   *
60   * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
61   *
62   * <pre>
63   * boolean include = true;
64   * BOMInputStream bomIn = BOMInputStream.builder()
65   *     .setInputStream(in)
66   *     .setInclude(include)
67   *     .get();
68   * if (bomIn.hasBOM()) {
69   *     // has a UTF-8 BOM
70   * }
71   * </pre>
72   *
73   * <h2>Example 3 - Detecting Multiple BOMs</h2>
74   *
75   * <pre>
76   * BOMInputStream bomIn = BOMInputStream.builder()
77   *   .setInputStream(in)
78   *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
79   *   .get();
80   * if (bomIn.hasBOM() == false) {
81   *     // No BOM found
82   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
83   *     // has a UTF-16LE BOM
84   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
85   *     // has a UTF-16BE BOM
86   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
87   *     // has a UTF-32LE BOM
88   * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
89   *     // has a UTF-32BE BOM
90   * }
91   * </pre>
92   *
93   * @see org.apache.commons.io.ByteOrderMark
94   * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
95   * @since 2.0
96   */
97  public class BOMInputStream extends ProxyInputStream {
98  
99      /**
100      * Builds a new {@link BOMInputStream} instance.
101      *
102      * <h2>Using NIO</h2>
103      * <pre>{@code
104      * BOMInputStream s = BOMInputStream.builder()
105      *   .setPath(Paths.get("MyFile.xml"))
106      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
107      *   .setInclude(false)
108      *   .get();}
109      * </pre>
110      * <h2>Using IO</h2>
111      * <pre>{@code
112      * BOMInputStream s = BOMInputStream.builder()
113      *   .setFile(new File("MyFile.xml"))
114      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
115      *   .setInclude(false)
116      *   .get();}
117      * </pre>
118      *
119      * @since 2.12.0
120      */
121     public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {
122 
123         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
124 
125         /**
126          * For test access.
127          *
128          * @return the default byte order mark
129          */
130         static ByteOrderMark getDefaultByteOrderMark() {
131             return DEFAULT[0];
132         }
133 
134         private ByteOrderMark[] byteOrderMarks = DEFAULT;
135 
136         private boolean include;
137 
138         /**
139          * Constructs a new instance.
140          * <p>
141          * This builder use the aspects InputStream, OpenOption[], include, and ByteOrderMark[].
142          * </p>
143          * <p>
144          * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an
145          * {@link UnsupportedOperationException}.
146          * </p>
147          *
148          * @return a new instance.
149          * @throws UnsupportedOperationException if the origin cannot provide an InputStream.
150          * @see #getInputStream()
151          */
152         @SuppressWarnings("resource")
153         @Override
154         public BOMInputStream get() throws IOException {
155             return new BOMInputStream(getInputStream(), include, byteOrderMarks);
156         }
157 
158         /**
159          * Sets the ByteOrderMarks to detect and optionally exclude.
160          * <p>
161          * The default is {@link ByteOrderMark#UTF_8}.
162          * </p>
163          *
164          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
165          * @return this
166          */
167         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
168             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
169             return this;
170         }
171 
172         /**
173          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
174          * <p>
175          * The default is false.
176          * </p>
177          *
178          * @param include true to include the UTF-8 BOM or false to exclude it. return this;
179          * @return this
180          */
181         public Builder setInclude(final boolean include) {
182             this.include = include;
183             return this;
184         }
185 
186     }
187 
188     /**
189      * Compares ByteOrderMark objects in descending length order.
190      */
191     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
192 
193 
194     /**
195      * Constructs a new {@link Builder}.
196      *
197      * @return a new {@link Builder}.
198      * @since 2.12.0
199      */
200     public static Builder builder() {
201         return new Builder();
202     }
203 
204     /**
205      * BOMs are sorted from longest to shortest.
206      */
207     private final List<ByteOrderMark> boms;
208 
209     private ByteOrderMark byteOrderMark;
210     private int fbIndex;
211     private int fbLength;
212     private int[] firstBytes;
213     private final boolean include;
214     private boolean markedAtStart;
215     private int markFbIndex;
216 
217     /**
218      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
219      *
220      * @param delegate
221      *            the InputStream to delegate to
222      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
223      */
224     @Deprecated
225     public BOMInputStream(final InputStream delegate) {
226         this(delegate, false, Builder.DEFAULT);
227     }
228 
229     /**
230      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
231      *
232      * @param delegate
233      *            the InputStream to delegate to
234      * @param include
235      *            true to include the UTF-8 BOM or false to exclude it
236      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
237      */
238     @Deprecated
239     public BOMInputStream(final InputStream delegate, final boolean include) {
240         this(delegate, include, Builder.DEFAULT);
241     }
242 
243     /**
244      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
245      *
246      * @param delegate
247      *            the InputStream to delegate to
248      * @param include
249      *            true to include the specified BOMs or false to exclude them
250      * @param boms
251      *            The BOMs to detect and optionally exclude
252      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
253      */
254     @Deprecated
255     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
256         super(delegate);
257         if (IOUtils.length(boms) == 0) {
258             throw new IllegalArgumentException("No BOMs specified");
259         }
260         this.include = include;
261         final List<ByteOrderMark> list = Arrays.asList(boms);
262         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
263         list.sort(ByteOrderMarkLengthComparator);
264         this.boms = list;
265 
266     }
267 
268     /**
269      * Constructs a new BOM InputStream that excludes the specified BOMs.
270      *
271      * @param delegate
272      *            the InputStream to delegate to
273      * @param boms
274      *            The BOMs to detect and exclude
275      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
276      */
277     @Deprecated
278     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
279         this(delegate, false, boms);
280     }
281 
282     /**
283      * Find a BOM with the specified bytes.
284      *
285      * @return The matched BOM or null if none matched
286      */
287     private ByteOrderMark find() {
288         return boms.stream().filter(this::matches).findFirst().orElse(null);
289     }
290 
291     /**
292      * Gets the BOM (Byte Order Mark).
293      *
294      * @return The BOM or null if none
295      * @throws IOException
296      *             if an error reading the first bytes of the stream occurs
297      */
298     public ByteOrderMark getBOM() throws IOException {
299         if (firstBytes == null) {
300             fbLength = 0;
301             // BOMs are sorted from longest to shortest
302             final int maxBomSize = boms.get(0).length();
303             firstBytes = new int[maxBomSize];
304             // Read first maxBomSize bytes
305             for (int i = 0; i < firstBytes.length; i++) {
306                 firstBytes[i] = in.read();
307                 fbLength++;
308                 if (firstBytes[i] < 0) {
309                     break;
310                 }
311             }
312             // match BOM in firstBytes
313             byteOrderMark = find();
314             if (byteOrderMark != null && !include) {
315                 if (byteOrderMark.length() < firstBytes.length) {
316                     fbIndex = byteOrderMark.length();
317                 } else {
318                     fbLength = 0;
319                 }
320             }
321         }
322         return byteOrderMark;
323     }
324 
325     /**
326      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
327      *
328      * @return The BOM charset Name or null if no BOM found
329      * @throws IOException
330      *             if an error reading the first bytes of the stream occurs
331      *
332      */
333     public String getBOMCharsetName() throws IOException {
334         getBOM();
335         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
336     }
337 
338     /**
339      * Tests whether the stream contains one of the specified BOMs.
340      *
341      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
342      * @throws IOException
343      *             if an error reading the first bytes of the stream occurs
344      */
345     public boolean hasBOM() throws IOException {
346         return getBOM() != null;
347     }
348 
349     /**
350      * Tests whether the stream contains the specified BOM.
351      *
352      * @param bom
353      *            The BOM to check for
354      * @return true if the stream has the specified BOM, otherwise false if it does not
355      * @throws IllegalArgumentException
356      *             if the BOM is not one the stream is configured to detect
357      * @throws IOException
358      *             if an error reading the first bytes of the stream occurs
359      */
360     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
361         if (!boms.contains(bom)) {
362             throw new IllegalArgumentException("Stream not configured to detect " + bom);
363         }
364         return Objects.equals(getBOM(), bom);
365     }
366 
367     /**
368      * Invokes the delegate's {@code mark(int)} method.
369      *
370      * @param readLimit
371      *            read ahead limit
372      */
373     @Override
374     public synchronized void mark(final int readLimit) {
375         markFbIndex = fbIndex;
376         markedAtStart = firstBytes == null;
377         in.mark(readLimit);
378     }
379 
380     /**
381      * Checks if the bytes match a BOM.
382      *
383      * @param bom
384      *            The BOM
385      * @return true if the bytes match the bom, otherwise false
386      */
387     private boolean matches(final ByteOrderMark bom) {
388         // if (bom.length() != fbLength) {
389         // return false;
390         // }
391         // firstBytes may be bigger than the BOM bytes
392         for (int i = 0; i < bom.length(); i++) {
393             if (bom.get(i) != firstBytes[i]) {
394                 return false;
395             }
396         }
397         return true;
398     }
399 
400     /**
401      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
402      *
403      * @return the byte read (excluding BOM) or -1 if the end of stream
404      * @throws IOException
405      *             if an I/O error occurs
406      */
407     @Override
408     public int read() throws IOException {
409         final int b = readFirstBytes();
410         return b >= 0 ? b : in.read();
411     }
412 
413     /**
414      * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
415      *
416      * @param buf
417      *            the buffer to read the bytes into
418      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
419      * @throws IOException
420      *             if an I/O error occurs
421      */
422     @Override
423     public int read(final byte[] buf) throws IOException {
424         return read(buf, 0, buf.length);
425     }
426 
427     /**
428      * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
429      *
430      * @param buf
431      *            the buffer to read the bytes into
432      * @param off
433      *            The start offset
434      * @param len
435      *            The number of bytes to read (excluding BOM)
436      * @return the number of bytes read or -1 if the end of stream
437      * @throws IOException
438      *             if an I/O error occurs
439      */
440     @Override
441     public int read(final byte[] buf, int off, int len) throws IOException {
442         int firstCount = 0;
443         int b = 0;
444         while (len > 0 && b >= 0) {
445             b = readFirstBytes();
446             if (b >= 0) {
447                 buf[off++] = (byte) (b & 0xFF);
448                 len--;
449                 firstCount++;
450             }
451         }
452         final int secondCount = in.read(buf, off, len);
453         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
454     }
455 
456     /**
457      * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
458      * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
459      * processed already.
460      *
461      * @return the byte read (excluding BOM) or -1 if the end of stream
462      * @throws IOException
463      *             if an I/O error occurs
464      */
465     private int readFirstBytes() throws IOException {
466         getBOM();
467         return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
468     }
469 
470     /**
471      * Invokes the delegate's {@code reset()} method.
472      *
473      * @throws IOException
474      *             if an I/O error occurs
475      */
476     @Override
477     public synchronized void reset() throws IOException {
478         fbIndex = markFbIndex;
479         if (markedAtStart) {
480             firstBytes = null;
481         }
482 
483         in.reset();
484     }
485 
486     /**
487      * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
488      *
489      * @param n
490      *            the number of bytes to skip
491      * @return the number of bytes to skipped or -1 if the end of stream
492      * @throws IOException
493      *             if an I/O error occurs
494      */
495     @Override
496     public long skip(final long n) throws IOException {
497         int skipped = 0;
498         while (n > skipped && readFirstBytes() >= 0) {
499             skipped++;
500         }
501         return in.skip(n - skipped) + skipped;
502     }
503 }