001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Collections;
025import java.util.Comparator;
026import java.util.List;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030
031/**
032 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
033 *
034 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
035 * first byte in the stream.
036 *
037 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
038 * <ul>
039 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
040 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
041 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
042 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
043 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
044 * </ul>
045 *
046 *
047 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
048 *
049 * <pre>
050 * BOMInputStream bomIn = new BOMInputStream(in);
051 * if (bomIn.hasBOM()) {
052 *     // has a UTF-8 BOM
053 * }
054 * </pre>
055 *
056 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
057 *
058 * <pre>
059 * boolean include = true;
060 * BOMInputStream bomIn = new BOMInputStream(in, include);
061 * if (bomIn.hasBOM()) {
062 *     // has a UTF-8 BOM
063 * }
064 * </pre>
065 *
066 * <h2>Example 3 - Detect Multiple BOMs</h2>
067 *
068 * <pre>
069 * BOMInputStream bomIn = new BOMInputStream(in,
070 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
071 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
072 *   );
073 * if (bomIn.hasBOM() == false) {
074 *     // No BOM found
075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
076 *     // has a UTF-16LE BOM
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
078 *     // has a UTF-16BE BOM
079 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
080 *     // has a UTF-32LE BOM
081 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
082 *     // has a UTF-32BE BOM
083 * }
084 * </pre>
085 *
086 * @see org.apache.commons.io.ByteOrderMark
087 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
088 * @since 2.0
089 */
090public class BOMInputStream extends ProxyInputStream {
091    private final boolean include;
092    /**
093     * BOMs are sorted from longest to shortest.
094     */
095    private final List<ByteOrderMark> boms;
096    private ByteOrderMark byteOrderMark;
097    private int[] firstBytes;
098    private int fbLength;
099    private int fbIndex;
100    private int markFbIndex;
101    private boolean markedAtStart;
102
103    /**
104     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
105     *
106     * @param delegate
107     *            the InputStream to delegate to
108     */
109    public BOMInputStream(final InputStream delegate) {
110        this(delegate, false, ByteOrderMark.UTF_8);
111    }
112
113    /**
114     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
115     *
116     * @param delegate
117     *            the InputStream to delegate to
118     * @param include
119     *            true to include the UTF-8 BOM or false to exclude it
120     */
121    public BOMInputStream(final InputStream delegate, final boolean include) {
122        this(delegate, include, ByteOrderMark.UTF_8);
123    }
124
125    /**
126     * Constructs a new BOM InputStream that excludes the specified BOMs.
127     *
128     * @param delegate
129     *            the InputStream to delegate to
130     * @param boms
131     *            The BOMs to detect and exclude
132     */
133    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
134        this(delegate, false, boms);
135    }
136
137    /**
138     * Compares ByteOrderMark objects in descending length order.
139     */
140    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = (bom1, bom2) -> {
141        final int len1 = bom1.length();
142        final int len2 = bom2.length();
143        if (len1 > len2) {
144            return EOF;
145        }
146        if (len2 > len1) {
147            return 1;
148        }
149        return 0;
150    };
151
152    /**
153     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
154     *
155     * @param delegate
156     *            the InputStream to delegate to
157     * @param include
158     *            true to include the specified BOMs or false to exclude them
159     * @param boms
160     *            The BOMs to detect and optionally exclude
161     */
162    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
163        super(delegate);
164        if (IOUtils.length(boms) == 0) {
165            throw new IllegalArgumentException("No BOMs specified");
166        }
167        this.include = include;
168        final List<ByteOrderMark> list = Arrays.asList(boms);
169        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
170        Collections.sort(list, ByteOrderMarkLengthComparator);
171        this.boms = list;
172
173    }
174
175    /**
176     * Indicates whether the stream contains one of the specified BOMs.
177     *
178     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
179     * @throws IOException
180     *             if an error reading the first bytes of the stream occurs
181     */
182    public boolean hasBOM() throws IOException {
183        return getBOM() != null;
184    }
185
186    /**
187     * Indicates whether the stream contains the specified BOM.
188     *
189     * @param bom
190     *            The BOM to check for
191     * @return true if the stream has the specified BOM, otherwise false if it does not
192     * @throws IllegalArgumentException
193     *             if the BOM is not one the stream is configured to detect
194     * @throws IOException
195     *             if an error reading the first bytes of the stream occurs
196     */
197    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
198        if (!boms.contains(bom)) {
199            throw new IllegalArgumentException("Stream not configure to detect " + bom);
200        }
201        getBOM();
202        return byteOrderMark != null && byteOrderMark.equals(bom);
203    }
204
205    /**
206     * Return the BOM (Byte Order Mark).
207     *
208     * @return The BOM or null if none
209     * @throws IOException
210     *             if an error reading the first bytes of the stream occurs
211     */
212    public ByteOrderMark getBOM() throws IOException {
213        if (firstBytes == null) {
214            fbLength = 0;
215            // BOMs are sorted from longest to shortest
216            final int maxBomSize = boms.get(0).length();
217            firstBytes = new int[maxBomSize];
218            // Read first maxBomSize bytes
219            for (int i = 0; i < firstBytes.length; i++) {
220                firstBytes[i] = in.read();
221                fbLength++;
222                if (firstBytes[i] < 0) {
223                    break;
224                }
225            }
226            // match BOM in firstBytes
227            byteOrderMark = find();
228            if (byteOrderMark != null) {
229                if (!include) {
230                    if (byteOrderMark.length() < firstBytes.length) {
231                        fbIndex = byteOrderMark.length();
232                    } else {
233                        fbLength = 0;
234                    }
235                }
236            }
237        }
238        return byteOrderMark;
239    }
240
241    /**
242     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
243     *
244     * @return The BOM charset Name or null if no BOM found
245     * @throws IOException
246     *             if an error reading the first bytes of the stream occurs
247     *
248     */
249    public String getBOMCharsetName() throws IOException {
250        getBOM();
251        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
252    }
253
254    /**
255     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
256     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
257     * processed already.
258     *
259     * @return the byte read (excluding BOM) or -1 if the end of stream
260     * @throws IOException
261     *             if an I/O error occurs
262     */
263    private int readFirstBytes() throws IOException {
264        getBOM();
265        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
266    }
267
268    /**
269     * Find a BOM with the specified bytes.
270     *
271     * @return The matched BOM or null if none matched
272     */
273    private ByteOrderMark find() {
274        for (final ByteOrderMark bom : boms) {
275            if (matches(bom)) {
276                return bom;
277            }
278        }
279        return null;
280    }
281
282    /**
283     * Check if the bytes match a BOM.
284     *
285     * @param bom
286     *            The BOM
287     * @return true if the bytes match the bom, otherwise false
288     */
289    private boolean matches(final ByteOrderMark bom) {
290        // if (bom.length() != fbLength) {
291        // return false;
292        // }
293        // firstBytes may be bigger than the BOM bytes
294        for (int i = 0; i < bom.length(); i++) {
295            if (bom.get(i) != firstBytes[i]) {
296                return false;
297            }
298        }
299        return true;
300    }
301
302    // ----------------------------------------------------------------------------
303    // Implementation of InputStream
304    // ----------------------------------------------------------------------------
305
306    /**
307     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
308     *
309     * @return the byte read (excluding BOM) or -1 if the end of stream
310     * @throws IOException
311     *             if an I/O error occurs
312     */
313    @Override
314    public int read() throws IOException {
315        final int b = readFirstBytes();
316        return b >= 0 ? b : in.read();
317    }
318
319    /**
320     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
321     *
322     * @param buf
323     *            the buffer to read the bytes into
324     * @param off
325     *            The start offset
326     * @param len
327     *            The number of bytes to read (excluding BOM)
328     * @return the number of bytes read or -1 if the end of stream
329     * @throws IOException
330     *             if an I/O error occurs
331     */
332    @Override
333    public int read(final byte[] buf, int off, int len) throws IOException {
334        int firstCount = 0;
335        int b = 0;
336        while (len > 0 && b >= 0) {
337            b = readFirstBytes();
338            if (b >= 0) {
339                buf[off++] = (byte) (b & 0xFF);
340                len--;
341                firstCount++;
342            }
343        }
344        final int secondCount = in.read(buf, off, len);
345        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
346    }
347
348    /**
349     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
350     *
351     * @param buf
352     *            the buffer to read the bytes into
353     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
354     * @throws IOException
355     *             if an I/O error occurs
356     */
357    @Override
358    public int read(final byte[] buf) throws IOException {
359        return read(buf, 0, buf.length);
360    }
361
362    /**
363     * Invokes the delegate's <code>mark(int)</code> method.
364     *
365     * @param readlimit
366     *            read ahead limit
367     */
368    @Override
369    public synchronized void mark(final int readlimit) {
370        markFbIndex = fbIndex;
371        markedAtStart = firstBytes == null;
372        in.mark(readlimit);
373    }
374
375    /**
376     * Invokes the delegate's <code>reset()</code> method.
377     *
378     * @throws IOException
379     *             if an I/O error occurs
380     */
381    @Override
382    public synchronized void reset() throws IOException {
383        fbIndex = markFbIndex;
384        if (markedAtStart) {
385            firstBytes = null;
386        }
387
388        in.reset();
389    }
390
391    /**
392     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM.
393     *
394     * @param n
395     *            the number of bytes to skip
396     * @return the number of bytes to skipped or -1 if the end of stream
397     * @throws IOException
398     *             if an I/O error occurs
399     */
400    @Override
401    public long skip(final long n) throws IOException {
402        int skipped = 0;
403        while ((n > skipped) && (readFirstBytes() >= 0)) {
404            skipped++;
405        }
406        return in.skip(n - skipped) + skipped;
407    }
408}