001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import java.util.Objects;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030
031/**
032 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
033 * <p>
034 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
035 * first byte in the stream.
036 * </p>
037 * <p>
038 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
039 * </p>
040 * <ul>
041 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
046 * </ul>
047 * <p>
048 * To build an instance, use {@link Builder}.
049 * </p>
050 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
051 *
052 * <pre>
053 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
054 * if (bomIn.hasBOM()) {
055 *     // has a UTF-8 BOM
056 * }
057 * </pre>
058 *
059 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
060 *
061 * <pre>
062 * boolean include = true;
063 * BOMInputStream bomIn = BOMInputStream.builder()
064 *     .setInputStream(in)
065 *     .setInclude(include)
066 *     .get();
067 * if (bomIn.hasBOM()) {
068 *     // has a UTF-8 BOM
069 * }
070 * </pre>
071 *
072 * <h2>Example 3 - Detecting Multiple BOMs</h2>
073 *
074 * <pre>
075 * BOMInputStream bomIn = BOMInputStream.builder()
076 *   .setInputStream(in)
077 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
078 *   .get();
079 * if (bomIn.hasBOM() == false) {
080 *     // No BOM found
081 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
082 *     // has a UTF-16LE BOM
083 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
084 *     // has a UTF-16BE BOM
085 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
086 *     // has a UTF-32LE BOM
087 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
088 *     // has a UTF-32BE BOM
089 * }
090 * </pre>
091 * <p>
092 * To build an instance, use {@link Builder}.
093 * </p>
094 * <p>
095 * This class is not thread-safe.
096 * </p>
097 *
098 * @see Builder
099 * @see org.apache.commons.io.ByteOrderMark
100 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
101 * @since 2.0
102 */
103public class BOMInputStream extends ProxyInputStream {
104
105    // @formatter:off
106    /**
107     * Builds a new {@link BOMInputStream}.
108     *
109     * <h2>Using NIO</h2>
110     * <pre>{@code
111     * BOMInputStream s = BOMInputStream.builder()
112     *   .setPath(Paths.get("MyFile.xml"))
113     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
114     *   .setInclude(false)
115     *   .get();}
116     * </pre>
117     * <h2>Using IO</h2>
118     * <pre>{@code
119     * BOMInputStream s = BOMInputStream.builder()
120     *   .setFile(new File("MyFile.xml"))
121     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
122     *   .setInclude(false)
123     *   .get();}
124     * </pre>
125     *
126     * @see #get()
127     * @since 2.12.0
128     */
129    // @formatter:on
130    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
131
132        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
133
134        /**
135         * For test access.
136         *
137         * @return the default byte order mark
138         */
139        static ByteOrderMark getDefaultByteOrderMark() {
140            return DEFAULT[0];
141        }
142
143        private ByteOrderMark[] byteOrderMarks = DEFAULT;
144
145        private boolean include;
146
147        /**
148         * Constructs a new builder of {@link BOMInputStream}.
149         */
150        public Builder() {
151            // empty
152        }
153
154        /**
155         * Builds a new {@link BOMInputStream}.
156         * <p>
157         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
158         * </p>
159         * <p>
160         * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
161         * </p>
162         * <p>
163         * This builder uses the following aspects:
164         * </p>
165         * <ul>
166         * <li>{@link #getInputStream()}</li>
167         * <li>include}</li>
168         * <li>byteOrderMarks</li>
169         * </ul>
170         *
171         * @return a new instance.
172         * @throws IllegalStateException         if the {@code origin} is {@code null}.
173         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
174         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
175         * @see #getInputStream()
176         * @see #getUnchecked()
177         */
178        @Override
179        public BOMInputStream get() throws IOException {
180            return new BOMInputStream(this);
181        }
182
183        /**
184         * Sets the ByteOrderMarks to detect and optionally exclude.
185         * <p>
186         * The default is {@link ByteOrderMark#UTF_8}.
187         * </p>
188         *
189         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
190         * @return {@code this} instance.
191         */
192        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
193            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
194            return this;
195        }
196
197        /**
198         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
199         * <p>
200         * The default is false.
201         * </p>
202         *
203         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
204         * @return {@code this} instance.
205         */
206        public Builder setInclude(final boolean include) {
207            this.include = include;
208            return this;
209        }
210
211    }
212
213    /**
214     * Compares ByteOrderMark objects in descending length order.
215     */
216    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
217
218    /**
219     * Constructs a new {@link Builder}.
220     *
221     * @return a new {@link Builder}.
222     * @since 2.12.0
223     */
224    public static Builder builder() {
225        return new Builder();
226    }
227
228    /**
229     * BOMs are sorted from longest to shortest.
230     */
231    private final List<ByteOrderMark> bomList;
232
233    private ByteOrderMark byteOrderMark;
234    private int fbIndex;
235    private int[] firstBytes;
236    private final boolean include;
237    private boolean markedAtStart;
238    private int markFbIndex;
239
240    private BOMInputStream(final Builder builder) throws IOException {
241        super(builder);
242        if (IOUtils.length(builder.byteOrderMarks) == 0) {
243            throw new IllegalArgumentException("No ByteOrderMark specified.");
244        }
245        this.include = builder.include;
246        final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
247        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
248        list.sort(ByteOrderMarkLengthComparator);
249        this.bomList = list;
250    }
251
252    /**
253     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
254     *
255     * @param delegate
256     *            the InputStream to delegate to
257     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
258     */
259    @Deprecated
260    public BOMInputStream(final InputStream delegate) {
261        this(delegate, false, Builder.DEFAULT);
262    }
263
264    /**
265     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
266     *
267     * @param delegate
268     *            the InputStream to delegate to
269     * @param include
270     *            true to include the UTF-8 BOM or false to exclude it
271     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
272     */
273    @Deprecated
274    public BOMInputStream(final InputStream delegate, final boolean include) {
275        this(delegate, include, Builder.DEFAULT);
276    }
277
278    /**
279     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
280     *
281     * @param delegate
282     *            the InputStream to delegate to
283     * @param include
284     *            true to include the specified BOMs or false to exclude them
285     * @param boms
286     *            The BOMs to detect and optionally exclude
287     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
288     */
289    @Deprecated
290    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
291        super(delegate);
292        if (IOUtils.length(boms) == 0) {
293            throw new IllegalArgumentException("No BOMs specified");
294        }
295        this.include = include;
296        final List<ByteOrderMark> list = Arrays.asList(boms);
297        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
298        list.sort(ByteOrderMarkLengthComparator);
299        this.bomList = list;
300    }
301
302    /**
303     * Constructs a new BOM InputStream that excludes the specified BOMs.
304     *
305     * @param delegate
306     *            the InputStream to delegate to
307     * @param boms
308     *            The BOMs to detect and exclude
309     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
310     */
311    @Deprecated
312    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
313        this(delegate, false, boms);
314    }
315
316    /**
317     * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
318     *
319     * @return The matched BOM or null if none matched.
320     */
321    private ByteOrderMark find() {
322        return bomList.stream().filter(this::matches).findFirst().orElse(null);
323    }
324
325    /**
326     * Gets the ByteOrderMark (Byte Order Mark).
327     *
328     * @return The BOM or null if none matched.
329     * @throws IOException
330     *             if an error reading the first bytes of the stream occurs.
331     */
332    public ByteOrderMark getBOM() throws IOException {
333        if (firstBytes == null) {
334            byteOrderMark = readBom();
335        }
336        return byteOrderMark;
337    }
338
339    /**
340     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
341     *
342     * @return The BOM charset Name or null if no BOM found
343     * @throws IOException
344     *             if an error reading the first bytes of the stream occurs
345     */
346    public String getBOMCharsetName() throws IOException {
347        getBOM();
348        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
349    }
350
351    /**
352     * Tests whether the stream contains one of the specified BOMs.
353     *
354     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
355     * @throws IOException
356     *             if an error reading the first bytes of the stream occurs
357     */
358    public boolean hasBOM() throws IOException {
359        return getBOM() != null;
360    }
361
362    /**
363     * Tests whether the stream contains the specified BOM.
364     *
365     * @param bom
366     *            The BOM to check for
367     * @return true if the stream has the specified BOM, otherwise false if it does not
368     * @throws IllegalArgumentException
369     *             if the BOM is not one the stream is configured to detect
370     * @throws IOException
371     *             if an error reading the first bytes of the stream occurs
372     */
373    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
374        if (!bomList.contains(bom)) {
375            throw new IllegalArgumentException("Stream not configured to detect " + bom);
376        }
377        return Objects.equals(getBOM(), bom);
378    }
379
380    /**
381     * Invokes the delegate's {@code mark(int)} method.
382     *
383     * @param readLimit
384     *            read ahead limit
385     */
386    @Override
387    public synchronized void mark(final int readLimit) {
388        markFbIndex = fbIndex;
389        markedAtStart = firstBytes == null;
390        in.mark(readLimit);
391    }
392
393    /**
394     * Checks if the bytes match a BOM.
395     *
396     * @param bom
397     *            The BOM
398     * @return true if the bytes match the bom, otherwise false
399     */
400    private boolean matches(final ByteOrderMark bom) {
401        return bom.matches(firstBytes);
402    }
403
404    /**
405     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
406     *
407     * @return the byte read (excluding BOM) or -1 if the end of stream
408     * @throws IOException
409     *             if an I/O error occurs
410     */
411    @Override
412    public int read() throws IOException {
413        checkOpen();
414        final int b = readFirstBytes();
415        return b >= 0 ? b : in.read();
416    }
417
418    /**
419     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
420     *
421     * @param buf
422     *            the buffer to read the bytes into
423     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
424     * @throws IOException
425     *             if an I/O error occurs
426     */
427    @Override
428    public int read(final byte[] buf) throws IOException {
429        return read(buf, 0, buf.length);
430    }
431
432    /**
433     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
434     *
435     * @param buf
436     *            the buffer to read the bytes into
437     * @param off
438     *            The start offset
439     * @param len
440     *            The number of bytes to read (excluding BOM)
441     * @return the number of bytes read or -1 if the end of stream
442     * @throws IOException
443     *             if an I/O error occurs
444     */
445    @Override
446    public int read(final byte[] buf, int off, int len) throws IOException {
447        int firstCount = 0;
448        int b = 0;
449        while (len > 0 && b >= 0) {
450            b = readFirstBytes();
451            if (b >= 0) {
452                buf[off++] = (byte) (b & 0xFF);
453                len--;
454                firstCount++;
455            }
456        }
457        final int secondCount = in.read(buf, off, len);
458        afterRead(secondCount);
459        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
460    }
461
462    private ByteOrderMark readBom() throws IOException {
463        int fbLength = 0;
464        // BOMs are sorted from longest to shortest
465        final int maxBomSize = bomList.get(0).length();
466        final int[] tmp = new int[maxBomSize];
467        // Read first maxBomSize bytes
468        for (int i = 0; i < tmp.length; i++) {
469            tmp[i] = in.read();
470            afterRead(tmp[i]);
471            fbLength++;
472            if (tmp[i] < 0) {
473                break;
474            }
475        }
476        firstBytes = Arrays.copyOf(tmp, fbLength);
477        // match BOM in firstBytes
478        final ByteOrderMark bom = find();
479        if (bom != null && !include) {
480            if (bom.length() < firstBytes.length) {
481                fbIndex = bom.length();
482            } else {
483                firstBytes = new int[0];
484            }
485        }
486        return bom;
487    }
488
489    /**
490     * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
491     * valid byte or -1 to indicate that the initial bytes have been processed already.
492     *
493     * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
494     * @throws IOException if an I/O error occurs
495     */
496    private int readFirstBytes() throws IOException {
497        getBOM();
498        return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
499    }
500
501    /**
502     * Invokes the delegate's {@code reset()} method.
503     *
504     * @throws IOException
505     *             if an I/O error occurs
506     */
507    @Override
508    public synchronized void reset() throws IOException {
509        fbIndex = markFbIndex;
510        if (markedAtStart) {
511            firstBytes = null;
512        }
513        in.reset();
514    }
515
516    /**
517     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
518     *
519     * @param n
520     *            the number of bytes to skip
521     * @return the number of bytes to skipped or -1 if the end of stream
522     * @throws IOException
523     *             if an I/O error occurs
524     */
525    @Override
526    public long skip(final long n) throws IOException {
527        int skipped = 0;
528        while (n > skipped && readFirstBytes() >= 0) {
529            skipped++;
530        }
531        return in.skip(n - skipped) + skipped;
532    }
533}