001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import org.apache.commons.io.ByteOrderMark;
027import org.apache.commons.io.IOUtils;
028
029/**
030 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
031 * <p>
032 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
033 * first byte in the stream.
034 * </p>
035 * <p>
036 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
037 * </p>
038 * <ul>
039 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
044 * </ul>
045 *
046 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
047 *
048 * <pre>
049 * BOMInputStream bomIn = new BOMInputStream(in);
050 * if (bomIn.hasBOM()) {
051 *     // has a UTF-8 BOM
052 * }
053 * </pre>
054 *
055 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
056 *
057 * <pre>
058 * boolean include = true;
059 * BOMInputStream bomIn = new BOMInputStream(in, include);
060 * if (bomIn.hasBOM()) {
061 *     // has a UTF-8 BOM
062 * }
063 * </pre>
064 *
065 * <h2>Example 3 - Detect Multiple BOMs</h2>
066 *
067 * <pre>
068 * BOMInputStream bomIn = new BOMInputStream(in,
069 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
070 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
071 *   );
072 * if (bomIn.hasBOM() == false) {
073 *     // No BOM found
074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
075 *     // has a UTF-16LE BOM
076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
077 *     // has a UTF-16BE BOM
078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
079 *     // has a UTF-32LE BOM
080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
081 *     // has a UTF-32BE BOM
082 * }
083 * </pre>
084 *
085 * @see org.apache.commons.io.ByteOrderMark
086 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
087 * @since 2.0
088 */
089public class BOMInputStream extends ProxyInputStream {
090    private final boolean include;
091    /**
092     * BOMs are sorted from longest to shortest.
093     */
094    private final List<ByteOrderMark> boms;
095    private ByteOrderMark byteOrderMark;
096    private int[] firstBytes;
097    private int fbLength;
098    private int fbIndex;
099    private int markFbIndex;
100    private boolean markedAtStart;
101
102    /**
103     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
104     *
105     * @param delegate
106     *            the InputStream to delegate to
107     */
108    public BOMInputStream(final InputStream delegate) {
109        this(delegate, false, ByteOrderMark.UTF_8);
110    }
111
112    /**
113     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
114     *
115     * @param delegate
116     *            the InputStream to delegate to
117     * @param include
118     *            true to include the UTF-8 BOM or false to exclude it
119     */
120    public BOMInputStream(final InputStream delegate, final boolean include) {
121        this(delegate, include, ByteOrderMark.UTF_8);
122    }
123
124    /**
125     * Constructs a new BOM InputStream that excludes the specified BOMs.
126     *
127     * @param delegate
128     *            the InputStream to delegate to
129     * @param boms
130     *            The BOMs to detect and exclude
131     */
132    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
133        this(delegate, false, boms);
134    }
135
136    /**
137     * Compares ByteOrderMark objects in descending length order.
138     */
139    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = (bom1, bom2) -> {
140        final int len1 = bom1.length();
141        final int len2 = bom2.length();
142        return Integer.compare(len2, len1);
143    };
144
145    /**
146     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
147     *
148     * @param delegate
149     *            the InputStream to delegate to
150     * @param include
151     *            true to include the specified BOMs or false to exclude them
152     * @param boms
153     *            The BOMs to detect and optionally exclude
154     */
155    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
156        super(delegate);
157        if (IOUtils.length(boms) == 0) {
158            throw new IllegalArgumentException("No BOMs specified");
159        }
160        this.include = include;
161        final List<ByteOrderMark> list = Arrays.asList(boms);
162        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
163        list.sort(ByteOrderMarkLengthComparator);
164        this.boms = list;
165
166    }
167
168    /**
169     * Indicates whether the stream contains one of the specified BOMs.
170     *
171     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
172     * @throws IOException
173     *             if an error reading the first bytes of the stream occurs
174     */
175    public boolean hasBOM() throws IOException {
176        return getBOM() != null;
177    }
178
179    /**
180     * Indicates whether the stream contains the specified BOM.
181     *
182     * @param bom
183     *            The BOM to check for
184     * @return true if the stream has the specified BOM, otherwise false if it does not
185     * @throws IllegalArgumentException
186     *             if the BOM is not one the stream is configured to detect
187     * @throws IOException
188     *             if an error reading the first bytes of the stream occurs
189     */
190    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
191        if (!boms.contains(bom)) {
192            throw new IllegalArgumentException("Stream not configure to detect " + bom);
193        }
194        getBOM();
195        return byteOrderMark != null && byteOrderMark.equals(bom);
196    }
197
198    /**
199     * Return the BOM (Byte Order Mark).
200     *
201     * @return The BOM or null if none
202     * @throws IOException
203     *             if an error reading the first bytes of the stream occurs
204     */
205    public ByteOrderMark getBOM() throws IOException {
206        if (firstBytes == null) {
207            fbLength = 0;
208            // BOMs are sorted from longest to shortest
209            final int maxBomSize = boms.get(0).length();
210            firstBytes = new int[maxBomSize];
211            // Read first maxBomSize bytes
212            for (int i = 0; i < firstBytes.length; i++) {
213                firstBytes[i] = in.read();
214                fbLength++;
215                if (firstBytes[i] < 0) {
216                    break;
217                }
218            }
219            // match BOM in firstBytes
220            byteOrderMark = find();
221            if ((byteOrderMark != null) && !include) {
222                if (byteOrderMark.length() < firstBytes.length) {
223                    fbIndex = byteOrderMark.length();
224                } else {
225                    fbLength = 0;
226                }
227            }
228        }
229        return byteOrderMark;
230    }
231
232    /**
233     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
234     *
235     * @return The BOM charset Name or null if no BOM found
236     * @throws IOException
237     *             if an error reading the first bytes of the stream occurs
238     *
239     */
240    public String getBOMCharsetName() throws IOException {
241        getBOM();
242        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
243    }
244
245    /**
246     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
247     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
248     * processed already.
249     *
250     * @return the byte read (excluding BOM) or -1 if the end of stream
251     * @throws IOException
252     *             if an I/O error occurs
253     */
254    private int readFirstBytes() throws IOException {
255        getBOM();
256        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
257    }
258
259    /**
260     * Find a BOM with the specified bytes.
261     *
262     * @return The matched BOM or null if none matched
263     */
264    private ByteOrderMark find() {
265        for (final ByteOrderMark bom : boms) {
266            if (matches(bom)) {
267                return bom;
268            }
269        }
270        return null;
271    }
272
273    /**
274     * Check if the bytes match a BOM.
275     *
276     * @param bom
277     *            The BOM
278     * @return true if the bytes match the bom, otherwise false
279     */
280    private boolean matches(final ByteOrderMark bom) {
281        // if (bom.length() != fbLength) {
282        // return false;
283        // }
284        // firstBytes may be bigger than the BOM bytes
285        for (int i = 0; i < bom.length(); i++) {
286            if (bom.get(i) != firstBytes[i]) {
287                return false;
288            }
289        }
290        return true;
291    }
292
293    // ----------------------------------------------------------------------------
294    // Implementation of InputStream
295    // ----------------------------------------------------------------------------
296
297    /**
298     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
299     *
300     * @return the byte read (excluding BOM) or -1 if the end of stream
301     * @throws IOException
302     *             if an I/O error occurs
303     */
304    @Override
305    public int read() throws IOException {
306        final int b = readFirstBytes();
307        return b >= 0 ? b : in.read();
308    }
309
310    /**
311     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
312     *
313     * @param buf
314     *            the buffer to read the bytes into
315     * @param off
316     *            The start offset
317     * @param len
318     *            The number of bytes to read (excluding BOM)
319     * @return the number of bytes read or -1 if the end of stream
320     * @throws IOException
321     *             if an I/O error occurs
322     */
323    @Override
324    public int read(final byte[] buf, int off, int len) throws IOException {
325        int firstCount = 0;
326        int b = 0;
327        while (len > 0 && b >= 0) {
328            b = readFirstBytes();
329            if (b >= 0) {
330                buf[off++] = (byte) (b & 0xFF);
331                len--;
332                firstCount++;
333            }
334        }
335        final int secondCount = in.read(buf, off, len);
336        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
337    }
338
339    /**
340     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
341     *
342     * @param buf
343     *            the buffer to read the bytes into
344     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
345     * @throws IOException
346     *             if an I/O error occurs
347     */
348    @Override
349    public int read(final byte[] buf) throws IOException {
350        return read(buf, 0, buf.length);
351    }
352
353    /**
354     * Invokes the delegate's {@code mark(int)} method.
355     *
356     * @param readlimit
357     *            read ahead limit
358     */
359    @Override
360    public synchronized void mark(final int readlimit) {
361        markFbIndex = fbIndex;
362        markedAtStart = firstBytes == null;
363        in.mark(readlimit);
364    }
365
366    /**
367     * Invokes the delegate's {@code reset()} method.
368     *
369     * @throws IOException
370     *             if an I/O error occurs
371     */
372    @Override
373    public synchronized void reset() throws IOException {
374        fbIndex = markFbIndex;
375        if (markedAtStart) {
376            firstBytes = null;
377        }
378
379        in.reset();
380    }
381
382    /**
383     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
384     *
385     * @param n
386     *            the number of bytes to skip
387     * @return the number of bytes to skipped or -1 if the end of stream
388     * @throws IOException
389     *             if an I/O error occurs
390     */
391    @Override
392    public long skip(final long n) throws IOException {
393        int skipped = 0;
394        while ((n > skipped) && (readFirstBytes() >= 0)) {
395            skipped++;
396        }
397        return in.skip(n - skipped) + skipped;
398    }
399}