001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import java.util.Objects;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030import org.apache.commons.io.build.AbstractStreamBuilder;
031
032/**
033 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
034 * <p>
035 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
036 * first byte in the stream.
037 * </p>
038 * <p>
039 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
040 * </p>
041 * <ul>
042 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
043 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
044 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
045 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
046 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
047 * </ul>
048 * <p>
049 * To build an instance, use {@link Builder}.
050 * </p>
051 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
052 *
053 * <pre>
054 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
055 * if (bomIn.hasBOM()) {
056 *     // has a UTF-8 BOM
057 * }
058 * </pre>
059 *
060 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
061 *
062 * <pre>
063 * boolean include = true;
064 * BOMInputStream bomIn = BOMInputStream.builder()
065 *     .setInputStream(in)
066 *     .setInclude(include)
067 *     .get();
068 * if (bomIn.hasBOM()) {
069 *     // has a UTF-8 BOM
070 * }
071 * </pre>
072 *
073 * <h2>Example 3 - Detecting Multiple BOMs</h2>
074 *
075 * <pre>
076 * BOMInputStream bomIn = BOMInputStream.builder()
077 *   .setInputStream(in)
078 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
079 *   .get();
080 * if (bomIn.hasBOM() == false) {
081 *     // No BOM found
082 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
083 *     // has a UTF-16LE BOM
084 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
085 *     // has a UTF-16BE BOM
086 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
087 *     // has a UTF-32LE BOM
088 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
089 *     // has a UTF-32BE BOM
090 * }
091 * </pre>
092 * <p>
093 * To build an instance, use {@link Builder}.
094 * </p>
095 *
096 * @see Builder
097 * @see org.apache.commons.io.ByteOrderMark
098 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
099 * @since 2.0
100 */
101public class BOMInputStream extends ProxyInputStream {
102
103    // @formatter:off
104    /**
105     * Builds a new {@link BOMInputStream}.
106     *
107     * <h2>Using NIO</h2>
108     * <pre>{@code
109     * BOMInputStream s = BOMInputStream.builder()
110     *   .setPath(Paths.get("MyFile.xml"))
111     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
112     *   .setInclude(false)
113     *   .get();}
114     * </pre>
115     * <h2>Using IO</h2>
116     * <pre>{@code
117     * BOMInputStream s = BOMInputStream.builder()
118     *   .setFile(new File("MyFile.xml"))
119     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
120     *   .setInclude(false)
121     *   .get();}
122     * </pre>
123     *
124     * @see #get()
125     * @since 2.12.0
126     */
127    // @formatter:on
128    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {
129
130        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
131
132        /**
133         * For test access.
134         *
135         * @return the default byte order mark
136         */
137        static ByteOrderMark getDefaultByteOrderMark() {
138            return DEFAULT[0];
139        }
140
141        private ByteOrderMark[] byteOrderMarks = DEFAULT;
142
143        private boolean include;
144
145        /**
146         * Builds a new {@link BOMInputStream}.
147         * <p>
148         * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
149         * </p>
150         * <p>
151         * This builder use the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
152         * </p>
153         * <p>
154         * This builder use the following aspects:
155         * </p>
156         * <ul>
157         * <li>{@link #getInputStream()}</li>
158         * <li>include}</li>
159         * <li>byteOrderMarks</li>
160         * </ul>
161         *
162         * @return a new instance.
163         * @throws IllegalStateException         if the {@code origin} is {@code null}.
164         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
165         * @throws IOException                   if an I/O error occurs.
166         * @see #getInputStream()
167         */
168        @SuppressWarnings("resource")
169        @Override
170        public BOMInputStream get() throws IOException {
171            return new BOMInputStream(getInputStream(), include, byteOrderMarks);
172        }
173
174        /**
175         * Sets the ByteOrderMarks to detect and optionally exclude.
176         * <p>
177         * The default is {@link ByteOrderMark#UTF_8}.
178         * </p>
179         *
180         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
181         * @return this
182         */
183        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
184            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
185            return this;
186        }
187
188        /**
189         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
190         * <p>
191         * The default is false.
192         * </p>
193         *
194         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
195         * @return this
196         */
197        public Builder setInclude(final boolean include) {
198            this.include = include;
199            return this;
200        }
201
202    }
203
204    /**
205     * Compares ByteOrderMark objects in descending length order.
206     */
207    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
208
209    /**
210     * Constructs a new {@link Builder}.
211     *
212     * @return a new {@link Builder}.
213     * @since 2.12.0
214     */
215    public static Builder builder() {
216        return new Builder();
217    }
218
219    /**
220     * BOMs are sorted from longest to shortest.
221     */
222    private final List<ByteOrderMark> boms;
223
224    private ByteOrderMark byteOrderMark;
225    private int fbIndex;
226    private int fbLength;
227    private int[] firstBytes;
228    private final boolean include;
229    private boolean markedAtStart;
230    private int markFbIndex;
231
232    /**
233     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
234     *
235     * @param delegate
236     *            the InputStream to delegate to
237     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
238     */
239    @Deprecated
240    public BOMInputStream(final InputStream delegate) {
241        this(delegate, false, Builder.DEFAULT);
242    }
243
244    /**
245     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
246     *
247     * @param delegate
248     *            the InputStream to delegate to
249     * @param include
250     *            true to include the UTF-8 BOM or false to exclude it
251     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
252     */
253    @Deprecated
254    public BOMInputStream(final InputStream delegate, final boolean include) {
255        this(delegate, include, Builder.DEFAULT);
256    }
257
258    /**
259     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
260     *
261     * @param delegate
262     *            the InputStream to delegate to
263     * @param include
264     *            true to include the specified BOMs or false to exclude them
265     * @param boms
266     *            The BOMs to detect and optionally exclude
267     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
268     */
269    @Deprecated
270    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
271        super(delegate);
272        if (IOUtils.length(boms) == 0) {
273            throw new IllegalArgumentException("No BOMs specified");
274        }
275        this.include = include;
276        final List<ByteOrderMark> list = Arrays.asList(boms);
277        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
278        list.sort(ByteOrderMarkLengthComparator);
279        this.boms = list;
280    }
281
282    /**
283     * Constructs a new BOM InputStream that excludes the specified BOMs.
284     *
285     * @param delegate
286     *            the InputStream to delegate to
287     * @param boms
288     *            The BOMs to detect and exclude
289     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
290     */
291    @Deprecated
292    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
293        this(delegate, false, boms);
294    }
295
296    /**
297     * Find a BOM with the specified bytes.
298     *
299     * @return The matched BOM or null if none matched
300     */
301    private ByteOrderMark find() {
302        return boms.stream().filter(this::matches).findFirst().orElse(null);
303    }
304
305    /**
306     * Gets the BOM (Byte Order Mark).
307     *
308     * @return The BOM or null if none
309     * @throws IOException
310     *             if an error reading the first bytes of the stream occurs
311     */
312    public ByteOrderMark getBOM() throws IOException {
313        if (firstBytes == null) {
314            fbLength = 0;
315            // BOMs are sorted from longest to shortest
316            final int maxBomSize = boms.get(0).length();
317            firstBytes = new int[maxBomSize];
318            // Read first maxBomSize bytes
319            for (int i = 0; i < firstBytes.length; i++) {
320                firstBytes[i] = in.read();
321                fbLength++;
322                if (firstBytes[i] < 0) {
323                    break;
324                }
325            }
326            // match BOM in firstBytes
327            byteOrderMark = find();
328            if (byteOrderMark != null && !include) {
329                if (byteOrderMark.length() < firstBytes.length) {
330                    fbIndex = byteOrderMark.length();
331                } else {
332                    fbLength = 0;
333                }
334            }
335        }
336        return byteOrderMark;
337    }
338
339    /**
340     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
341     *
342     * @return The BOM charset Name or null if no BOM found
343     * @throws IOException
344     *             if an error reading the first bytes of the stream occurs
345     */
346    public String getBOMCharsetName() throws IOException {
347        getBOM();
348        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
349    }
350
351    /**
352     * Tests whether the stream contains one of the specified BOMs.
353     *
354     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
355     * @throws IOException
356     *             if an error reading the first bytes of the stream occurs
357     */
358    public boolean hasBOM() throws IOException {
359        return getBOM() != null;
360    }
361
362    /**
363     * Tests whether the stream contains the specified BOM.
364     *
365     * @param bom
366     *            The BOM to check for
367     * @return true if the stream has the specified BOM, otherwise false if it does not
368     * @throws IllegalArgumentException
369     *             if the BOM is not one the stream is configured to detect
370     * @throws IOException
371     *             if an error reading the first bytes of the stream occurs
372     */
373    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
374        if (!boms.contains(bom)) {
375            throw new IllegalArgumentException("Stream not configured to detect " + bom);
376        }
377        return Objects.equals(getBOM(), bom);
378    }
379
380    /**
381     * Invokes the delegate's {@code mark(int)} method.
382     *
383     * @param readLimit
384     *            read ahead limit
385     */
386    @Override
387    public synchronized void mark(final int readLimit) {
388        markFbIndex = fbIndex;
389        markedAtStart = firstBytes == null;
390        in.mark(readLimit);
391    }
392
393    /**
394     * Checks if the bytes match a BOM.
395     *
396     * @param bom
397     *            The BOM
398     * @return true if the bytes match the bom, otherwise false
399     */
400    private boolean matches(final ByteOrderMark bom) {
401        // if (bom.length() != fbLength) {
402        // return false;
403        // }
404        // firstBytes may be bigger than the BOM bytes
405        for (int i = 0; i < bom.length(); i++) {
406            if (bom.get(i) != firstBytes[i]) {
407                return false;
408            }
409        }
410        return true;
411    }
412
413    /**
414     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
415     *
416     * @return the byte read (excluding BOM) or -1 if the end of stream
417     * @throws IOException
418     *             if an I/O error occurs
419     */
420    @Override
421    public int read() throws IOException {
422        final int b = readFirstBytes();
423        return b >= 0 ? b : in.read();
424    }
425
426    /**
427     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
428     *
429     * @param buf
430     *            the buffer to read the bytes into
431     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
432     * @throws IOException
433     *             if an I/O error occurs
434     */
435    @Override
436    public int read(final byte[] buf) throws IOException {
437        return read(buf, 0, buf.length);
438    }
439
440    /**
441     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
442     *
443     * @param buf
444     *            the buffer to read the bytes into
445     * @param off
446     *            The start offset
447     * @param len
448     *            The number of bytes to read (excluding BOM)
449     * @return the number of bytes read or -1 if the end of stream
450     * @throws IOException
451     *             if an I/O error occurs
452     */
453    @Override
454    public int read(final byte[] buf, int off, int len) throws IOException {
455        int firstCount = 0;
456        int b = 0;
457        while (len > 0 && b >= 0) {
458            b = readFirstBytes();
459            if (b >= 0) {
460                buf[off++] = (byte) (b & 0xFF);
461                len--;
462                firstCount++;
463            }
464        }
465        final int secondCount = in.read(buf, off, len);
466        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
467    }
468
469    /**
470     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
471     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
472     * processed already.
473     *
474     * @return the byte read (excluding BOM) or -1 if the end of stream
475     * @throws IOException
476     *             if an I/O error occurs
477     */
478    private int readFirstBytes() throws IOException {
479        getBOM();
480        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
481    }
482
483    /**
484     * Invokes the delegate's {@code reset()} method.
485     *
486     * @throws IOException
487     *             if an I/O error occurs
488     */
489    @Override
490    public synchronized void reset() throws IOException {
491        fbIndex = markFbIndex;
492        if (markedAtStart) {
493            firstBytes = null;
494        }
495
496        in.reset();
497    }
498
499    /**
500     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
501     *
502     * @param n
503     *            the number of bytes to skip
504     * @return the number of bytes to skipped or -1 if the end of stream
505     * @throws IOException
506     *             if an I/O error occurs
507     */
508    @Override
509    public long skip(final long n) throws IOException {
510        int skipped = 0;
511        while (n > skipped && readFirstBytes() >= 0) {
512            skipped++;
513        }
514        return in.skip(n - skipped) + skipped;
515    }
516}