001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import java.util.Objects;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030import org.apache.commons.io.build.AbstractStreamBuilder;
031
032/**
033 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
034 * <p>
035 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
036 * first byte in the stream.
037 * </p>
038 * <p>
039 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
040 * </p>
041 * <ul>
042 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
043 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
044 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
045 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
046 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
047 * </ul>
048 * <p>
049 * To build an instance, see {@link Builder}.
050 * </p>
051 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
052 *
053 * <pre>
054 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
055 * if (bomIn.hasBOM()) {
056 *     // has a UTF-8 BOM
057 * }
058 * </pre>
059 *
060 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
061 *
062 * <pre>
063 * boolean include = true;
064 * BOMInputStream bomIn = BOMInputStream.builder()
065 *     .setInputStream(in)
066 *     .setInclude(include)
067 *     .get();
068 * if (bomIn.hasBOM()) {
069 *     // has a UTF-8 BOM
070 * }
071 * </pre>
072 *
073 * <h2>Example 3 - Detecting Multiple BOMs</h2>
074 *
075 * <pre>
076 * BOMInputStream bomIn = BOMInputStream.builder()
077 *   .setInputStream(in)
078 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
079 *   .get();
080 * if (bomIn.hasBOM() == false) {
081 *     // No BOM found
082 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
083 *     // has a UTF-16LE BOM
084 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
085 *     // has a UTF-16BE BOM
086 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
087 *     // has a UTF-32LE BOM
088 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
089 *     // has a UTF-32BE BOM
090 * }
091 * </pre>
092 *
093 * @see org.apache.commons.io.ByteOrderMark
094 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
095 * @since 2.0
096 */
097public class BOMInputStream extends ProxyInputStream {
098
099    /**
100     * Builds a new {@link BOMInputStream} instance.
101     *
102     * <h2>Using NIO</h2>
103     * <pre>{@code
104     * BOMInputStream s = BOMInputStream.builder()
105     *   .setPath(Paths.get("MyFile.xml"))
106     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
107     *   .setInclude(false)
108     *   .get();}
109     * </pre>
110     * <h2>Using IO</h2>
111     * <pre>{@code
112     * BOMInputStream s = BOMInputStream.builder()
113     *   .setFile(new File("MyFile.xml"))
114     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
115     *   .setInclude(false)
116     *   .get();}
117     * </pre>
118     *
119     * @since 2.12.0
120     */
121    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {
122
123        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
124
125        /**
126         * For test access.
127         *
128         * @return the default byte order mark
129         */
130        static ByteOrderMark getDefaultByteOrderMark() {
131            return DEFAULT[0];
132        }
133
134        private ByteOrderMark[] byteOrderMarks = DEFAULT;
135
136        private boolean include;
137
138        /**
139         * Constructs a new instance.
140         * <p>
141         * This builder use the aspects InputStream, OpenOption[], include, and ByteOrderMark[].
142         * </p>
143         * <p>
144         * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an
145         * {@link UnsupportedOperationException}.
146         * </p>
147         *
148         * @return a new instance.
149         * @throws UnsupportedOperationException if the origin cannot provide an InputStream.
150         * @see #getInputStream()
151         */
152        @SuppressWarnings("resource")
153        @Override
154        public BOMInputStream get() throws IOException {
155            return new BOMInputStream(getInputStream(), include, byteOrderMarks);
156        }
157
158        /**
159         * Sets the ByteOrderMarks to detect and optionally exclude.
160         * <p>
161         * The default is {@link ByteOrderMark#UTF_8}.
162         * </p>
163         *
164         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
165         * @return this
166         */
167        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
168            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
169            return this;
170        }
171
172        /**
173         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
174         * <p>
175         * The default is false.
176         * </p>
177         *
178         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
179         * @return this
180         */
181        public Builder setInclude(final boolean include) {
182            this.include = include;
183            return this;
184        }
185
186    }
187
188    /**
189     * Compares ByteOrderMark objects in descending length order.
190     */
191    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
192
193
194    /**
195     * Constructs a new {@link Builder}.
196     *
197     * @return a new {@link Builder}.
198     * @since 2.12.0
199     */
200    public static Builder builder() {
201        return new Builder();
202    }
203
204    /**
205     * BOMs are sorted from longest to shortest.
206     */
207    private final List<ByteOrderMark> boms;
208
209    private ByteOrderMark byteOrderMark;
210    private int fbIndex;
211    private int fbLength;
212    private int[] firstBytes;
213    private final boolean include;
214    private boolean markedAtStart;
215    private int markFbIndex;
216
217    /**
218     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
219     *
220     * @param delegate
221     *            the InputStream to delegate to
222     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
223     */
224    @Deprecated
225    public BOMInputStream(final InputStream delegate) {
226        this(delegate, false, Builder.DEFAULT);
227    }
228
229    /**
230     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
231     *
232     * @param delegate
233     *            the InputStream to delegate to
234     * @param include
235     *            true to include the UTF-8 BOM or false to exclude it
236     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
237     */
238    @Deprecated
239    public BOMInputStream(final InputStream delegate, final boolean include) {
240        this(delegate, include, Builder.DEFAULT);
241    }
242
243    /**
244     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
245     *
246     * @param delegate
247     *            the InputStream to delegate to
248     * @param include
249     *            true to include the specified BOMs or false to exclude them
250     * @param boms
251     *            The BOMs to detect and optionally exclude
252     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
253     */
254    @Deprecated
255    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
256        super(delegate);
257        if (IOUtils.length(boms) == 0) {
258            throw new IllegalArgumentException("No BOMs specified");
259        }
260        this.include = include;
261        final List<ByteOrderMark> list = Arrays.asList(boms);
262        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
263        list.sort(ByteOrderMarkLengthComparator);
264        this.boms = list;
265
266    }
267
268    /**
269     * Constructs a new BOM InputStream that excludes the specified BOMs.
270     *
271     * @param delegate
272     *            the InputStream to delegate to
273     * @param boms
274     *            The BOMs to detect and exclude
275     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
276     */
277    @Deprecated
278    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
279        this(delegate, false, boms);
280    }
281
282    /**
283     * Find a BOM with the specified bytes.
284     *
285     * @return The matched BOM or null if none matched
286     */
287    private ByteOrderMark find() {
288        return boms.stream().filter(this::matches).findFirst().orElse(null);
289    }
290
291    /**
292     * Gets the BOM (Byte Order Mark).
293     *
294     * @return The BOM or null if none
295     * @throws IOException
296     *             if an error reading the first bytes of the stream occurs
297     */
298    public ByteOrderMark getBOM() throws IOException {
299        if (firstBytes == null) {
300            fbLength = 0;
301            // BOMs are sorted from longest to shortest
302            final int maxBomSize = boms.get(0).length();
303            firstBytes = new int[maxBomSize];
304            // Read first maxBomSize bytes
305            for (int i = 0; i < firstBytes.length; i++) {
306                firstBytes[i] = in.read();
307                fbLength++;
308                if (firstBytes[i] < 0) {
309                    break;
310                }
311            }
312            // match BOM in firstBytes
313            byteOrderMark = find();
314            if (byteOrderMark != null && !include) {
315                if (byteOrderMark.length() < firstBytes.length) {
316                    fbIndex = byteOrderMark.length();
317                } else {
318                    fbLength = 0;
319                }
320            }
321        }
322        return byteOrderMark;
323    }
324
325    /**
326     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
327     *
328     * @return The BOM charset Name or null if no BOM found
329     * @throws IOException
330     *             if an error reading the first bytes of the stream occurs
331     *
332     */
333    public String getBOMCharsetName() throws IOException {
334        getBOM();
335        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
336    }
337
338    /**
339     * Tests whether the stream contains one of the specified BOMs.
340     *
341     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
342     * @throws IOException
343     *             if an error reading the first bytes of the stream occurs
344     */
345    public boolean hasBOM() throws IOException {
346        return getBOM() != null;
347    }
348
349    /**
350     * Tests whether the stream contains the specified BOM.
351     *
352     * @param bom
353     *            The BOM to check for
354     * @return true if the stream has the specified BOM, otherwise false if it does not
355     * @throws IllegalArgumentException
356     *             if the BOM is not one the stream is configured to detect
357     * @throws IOException
358     *             if an error reading the first bytes of the stream occurs
359     */
360    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
361        if (!boms.contains(bom)) {
362            throw new IllegalArgumentException("Stream not configured to detect " + bom);
363        }
364        return Objects.equals(getBOM(), bom);
365    }
366
367    /**
368     * Invokes the delegate's {@code mark(int)} method.
369     *
370     * @param readLimit
371     *            read ahead limit
372     */
373    @Override
374    public synchronized void mark(final int readLimit) {
375        markFbIndex = fbIndex;
376        markedAtStart = firstBytes == null;
377        in.mark(readLimit);
378    }
379
380    /**
381     * Checks if the bytes match a BOM.
382     *
383     * @param bom
384     *            The BOM
385     * @return true if the bytes match the bom, otherwise false
386     */
387    private boolean matches(final ByteOrderMark bom) {
388        // if (bom.length() != fbLength) {
389        // return false;
390        // }
391        // firstBytes may be bigger than the BOM bytes
392        for (int i = 0; i < bom.length(); i++) {
393            if (bom.get(i) != firstBytes[i]) {
394                return false;
395            }
396        }
397        return true;
398    }
399
400    /**
401     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
402     *
403     * @return the byte read (excluding BOM) or -1 if the end of stream
404     * @throws IOException
405     *             if an I/O error occurs
406     */
407    @Override
408    public int read() throws IOException {
409        final int b = readFirstBytes();
410        return b >= 0 ? b : in.read();
411    }
412
413    /**
414     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
415     *
416     * @param buf
417     *            the buffer to read the bytes into
418     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
419     * @throws IOException
420     *             if an I/O error occurs
421     */
422    @Override
423    public int read(final byte[] buf) throws IOException {
424        return read(buf, 0, buf.length);
425    }
426
427    /**
428     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
429     *
430     * @param buf
431     *            the buffer to read the bytes into
432     * @param off
433     *            The start offset
434     * @param len
435     *            The number of bytes to read (excluding BOM)
436     * @return the number of bytes read or -1 if the end of stream
437     * @throws IOException
438     *             if an I/O error occurs
439     */
440    @Override
441    public int read(final byte[] buf, int off, int len) throws IOException {
442        int firstCount = 0;
443        int b = 0;
444        while (len > 0 && b >= 0) {
445            b = readFirstBytes();
446            if (b >= 0) {
447                buf[off++] = (byte) (b & 0xFF);
448                len--;
449                firstCount++;
450            }
451        }
452        final int secondCount = in.read(buf, off, len);
453        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
454    }
455
456    /**
457     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
458     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
459     * processed already.
460     *
461     * @return the byte read (excluding BOM) or -1 if the end of stream
462     * @throws IOException
463     *             if an I/O error occurs
464     */
465    private int readFirstBytes() throws IOException {
466        getBOM();
467        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
468    }
469
470    /**
471     * Invokes the delegate's {@code reset()} method.
472     *
473     * @throws IOException
474     *             if an I/O error occurs
475     */
476    @Override
477    public synchronized void reset() throws IOException {
478        fbIndex = markFbIndex;
479        if (markedAtStart) {
480            firstBytes = null;
481        }
482
483        in.reset();
484    }
485
486    /**
487     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
488     *
489     * @param n
490     *            the number of bytes to skip
491     * @return the number of bytes to skipped or -1 if the end of stream
492     * @throws IOException
493     *             if an I/O error occurs
494     */
495    @Override
496    public long skip(final long n) throws IOException {
497        int skipped = 0;
498        while (n > skipped && readFirstBytes() >= 0) {
499            skipped++;
500        }
501        return in.skip(n - skipped) + skipped;
502    }
503}