001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import java.util.Objects;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030import org.apache.commons.io.build.AbstractStreamBuilder;
031
032/**
033 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
034 * <p>
035 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
036 * first byte in the stream.
037 * </p>
038 * <p>
039 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
040 * </p>
041 * <ul>
042 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
043 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
044 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
045 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
046 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
047 * </ul>
048 *
049 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
050 *
051 * <pre>
052 * BOMInputStream bomIn = new BOMInputStream(in);
053 * if (bomIn.hasBOM()) {
054 *     // has a UTF-8 BOM
055 * }
056 * </pre>
057 *
058 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
059 *
060 * <pre>
061 * boolean include = true;
062 * BOMInputStream bomIn = new BOMInputStream(in, include);
063 * if (bomIn.hasBOM()) {
064 *     // has a UTF-8 BOM
065 * }
066 * </pre>
067 *
068 * <h2>Example 3 - Detect Multiple BOMs</h2>
069 *
070 * <pre>
071 * BOMInputStream bomIn = new BOMInputStream(in,
072 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
073 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
074 *   );
075 * if (bomIn.hasBOM() == false) {
076 *     // No BOM found
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
078 *     // has a UTF-16LE BOM
079 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
080 *     // has a UTF-16BE BOM
081 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
082 *     // has a UTF-32LE BOM
083 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
084 *     // has a UTF-32BE BOM
085 * }
086 * </pre>
087 *
088 * @see org.apache.commons.io.ByteOrderMark
089 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
090 * @since 2.0
091 */
092public class BOMInputStream extends ProxyInputStream {
093
094    /**
095     * Builds a new {@link BOMInputStream} instance.
096     * <p>
097     * For example:
098     * </p>
099     * <pre>{@code
100     * BOMInputStream s = BOMInputStream.builder()
101     *   .setPath(path)
102     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
103     *   .setInclude(false)
104     *   .get()}
105     * </pre>
106     * <p>
107     * @since 2.12.0
108     */
109    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {
110
111        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
112
113        // for test access
114        static ByteOrderMark getDefaultBOM() {
115            return DEFAULT[0];
116        }
117
118        private ByteOrderMark[] byteOrderMarks = DEFAULT;
119
120        private boolean include;
121
122        /**
123         * Constructs a new instance.
124         *
125         * @throws UnsupportedOperationException if the origin cannot be converted to an InputStream.
126         */
127        @SuppressWarnings("resource")
128        @Override
129        public BOMInputStream get() throws IOException {
130            return new BOMInputStream(getOrigin().getInputStream(), include, byteOrderMarks);
131        }
132
133        /**
134         * Sets the ByteOrderMarks to detect and optionally exclude.
135         *
136         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
137         * @return this
138         */
139        public Builder setByteOrderMarks(final ByteOrderMark[] byteOrderMarks) {
140            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
141            return this;
142        }
143
144        /**
145         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
146         *
147         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
148         * @return this
149         */
150        public Builder setInclude(final boolean include) {
151            this.include = include;
152            return this;
153        }
154
155    }
156
157    /**
158     * Compares ByteOrderMark objects in descending length order.
159     */
160    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
161
162
163    /**
164     * Constructs a new {@link Builder}.
165     *
166     * @return a new {@link Builder}.
167     * @since 2.12.0
168     */
169    public static Builder builder() {
170        return new Builder();
171    }
172
173    /**
174     * BOMs are sorted from longest to shortest.
175     */
176    private final List<ByteOrderMark> boms;
177
178    private ByteOrderMark byteOrderMark;
179    private int fbIndex;
180    private int fbLength;
181    private int[] firstBytes;
182    private final boolean include;
183    private boolean markedAtStart;
184    private int markFbIndex;
185
186    /**
187     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
188     *
189     * @param delegate
190     *            the InputStream to delegate to
191     * @deprecated Use {@link #builder()}
192     */
193    @Deprecated
194    public BOMInputStream(final InputStream delegate) {
195        this(delegate, false, Builder.DEFAULT);
196    }
197
198    /**
199     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
200     *
201     * @param delegate
202     *            the InputStream to delegate to
203     * @param include
204     *            true to include the UTF-8 BOM or false to exclude it
205     * @deprecated Use {@link #builder()}
206     */
207    @Deprecated
208    public BOMInputStream(final InputStream delegate, final boolean include) {
209        this(delegate, include, Builder.DEFAULT);
210    }
211
212    /**
213     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
214     *
215     * @param delegate
216     *            the InputStream to delegate to
217     * @param include
218     *            true to include the specified BOMs or false to exclude them
219     * @param boms
220     *            The BOMs to detect and optionally exclude
221     * @deprecated Use {@link #builder()}
222     */
223    @Deprecated
224    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
225        super(delegate);
226        if (IOUtils.length(boms) == 0) {
227            throw new IllegalArgumentException("No BOMs specified");
228        }
229        this.include = include;
230        final List<ByteOrderMark> list = Arrays.asList(boms);
231        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
232        list.sort(ByteOrderMarkLengthComparator);
233        this.boms = list;
234
235    }
236
237    /**
238     * Constructs a new BOM InputStream that excludes the specified BOMs.
239     *
240     * @param delegate
241     *            the InputStream to delegate to
242     * @param boms
243     *            The BOMs to detect and exclude
244     * @deprecated Use {@link #builder()}
245     */
246    @Deprecated
247    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
248        this(delegate, false, boms);
249    }
250
251    /**
252     * Find a BOM with the specified bytes.
253     *
254     * @return The matched BOM or null if none matched
255     */
256    private ByteOrderMark find() {
257        return boms.stream().filter(this::matches).findFirst().orElse(null);
258    }
259
260    /**
261     * Gets the BOM (Byte Order Mark).
262     *
263     * @return The BOM or null if none
264     * @throws IOException
265     *             if an error reading the first bytes of the stream occurs
266     */
267    public ByteOrderMark getBOM() throws IOException {
268        if (firstBytes == null) {
269            fbLength = 0;
270            // BOMs are sorted from longest to shortest
271            final int maxBomSize = boms.get(0).length();
272            firstBytes = new int[maxBomSize];
273            // Read first maxBomSize bytes
274            for (int i = 0; i < firstBytes.length; i++) {
275                firstBytes[i] = in.read();
276                fbLength++;
277                if (firstBytes[i] < 0) {
278                    break;
279                }
280            }
281            // match BOM in firstBytes
282            byteOrderMark = find();
283            if (byteOrderMark != null && !include) {
284                if (byteOrderMark.length() < firstBytes.length) {
285                    fbIndex = byteOrderMark.length();
286                } else {
287                    fbLength = 0;
288                }
289            }
290        }
291        return byteOrderMark;
292    }
293
294    /**
295     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
296     *
297     * @return The BOM charset Name or null if no BOM found
298     * @throws IOException
299     *             if an error reading the first bytes of the stream occurs
300     *
301     */
302    public String getBOMCharsetName() throws IOException {
303        getBOM();
304        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
305    }
306
307    /**
308     * Tests whether the stream contains one of the specified BOMs.
309     *
310     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
311     * @throws IOException
312     *             if an error reading the first bytes of the stream occurs
313     */
314    public boolean hasBOM() throws IOException {
315        return getBOM() != null;
316    }
317
318    /**
319     * Tests whether the stream contains the specified BOM.
320     *
321     * @param bom
322     *            The BOM to check for
323     * @return true if the stream has the specified BOM, otherwise false if it does not
324     * @throws IllegalArgumentException
325     *             if the BOM is not one the stream is configured to detect
326     * @throws IOException
327     *             if an error reading the first bytes of the stream occurs
328     */
329    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
330        if (!boms.contains(bom)) {
331            throw new IllegalArgumentException("Stream not configured to detect " + bom);
332        }
333        return Objects.equals(getBOM(), bom);
334    }
335
336    /**
337     * Invokes the delegate's {@code mark(int)} method.
338     *
339     * @param readlimit
340     *            read ahead limit
341     */
342    @Override
343    public synchronized void mark(final int readlimit) {
344        markFbIndex = fbIndex;
345        markedAtStart = firstBytes == null;
346        in.mark(readlimit);
347    }
348
349    /**
350     * Checks if the bytes match a BOM.
351     *
352     * @param bom
353     *            The BOM
354     * @return true if the bytes match the bom, otherwise false
355     */
356    private boolean matches(final ByteOrderMark bom) {
357        // if (bom.length() != fbLength) {
358        // return false;
359        // }
360        // firstBytes may be bigger than the BOM bytes
361        for (int i = 0; i < bom.length(); i++) {
362            if (bom.get(i) != firstBytes[i]) {
363                return false;
364            }
365        }
366        return true;
367    }
368
369    /**
370     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
371     *
372     * @return the byte read (excluding BOM) or -1 if the end of stream
373     * @throws IOException
374     *             if an I/O error occurs
375     */
376    @Override
377    public int read() throws IOException {
378        final int b = readFirstBytes();
379        return b >= 0 ? b : in.read();
380    }
381
382    /**
383     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
384     *
385     * @param buf
386     *            the buffer to read the bytes into
387     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
388     * @throws IOException
389     *             if an I/O error occurs
390     */
391    @Override
392    public int read(final byte[] buf) throws IOException {
393        return read(buf, 0, buf.length);
394    }
395
396    /**
397     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
398     *
399     * @param buf
400     *            the buffer to read the bytes into
401     * @param off
402     *            The start offset
403     * @param len
404     *            The number of bytes to read (excluding BOM)
405     * @return the number of bytes read or -1 if the end of stream
406     * @throws IOException
407     *             if an I/O error occurs
408     */
409    @Override
410    public int read(final byte[] buf, int off, int len) throws IOException {
411        int firstCount = 0;
412        int b = 0;
413        while (len > 0 && b >= 0) {
414            b = readFirstBytes();
415            if (b >= 0) {
416                buf[off++] = (byte) (b & 0xFF);
417                len--;
418                firstCount++;
419            }
420        }
421        final int secondCount = in.read(buf, off, len);
422        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
423    }
424
425    /**
426     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
427     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
428     * processed already.
429     *
430     * @return the byte read (excluding BOM) or -1 if the end of stream
431     * @throws IOException
432     *             if an I/O error occurs
433     */
434    private int readFirstBytes() throws IOException {
435        getBOM();
436        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
437    }
438
439    /**
440     * Invokes the delegate's {@code reset()} method.
441     *
442     * @throws IOException
443     *             if an I/O error occurs
444     */
445    @Override
446    public synchronized void reset() throws IOException {
447        fbIndex = markFbIndex;
448        if (markedAtStart) {
449            firstBytes = null;
450        }
451
452        in.reset();
453    }
454
455    /**
456     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
457     *
458     * @param n
459     *            the number of bytes to skip
460     * @return the number of bytes to skipped or -1 if the end of stream
461     * @throws IOException
462     *             if an I/O error occurs
463     */
464    @Override
465    public long skip(final long n) throws IOException {
466        int skipped = 0;
467        while (n > skipped && readFirstBytes() >= 0) {
468            skipped++;
469        }
470        return in.skip(n - skipped) + skipped;
471    }
472}