001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 * 
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 * 
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026
027import org.apache.commons.io.ByteOrderMark;
028
029/**
030 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
031 * 
032 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
033 * first byte in the stream.
034 * 
035 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
036 * <ul>
037 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
038 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
039 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
040 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
041 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
042 * </ul>
043 * 
044 * 
045 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
046 * 
047 * <pre>
048 * BOMInputStream bomIn = new BOMInputStream(in);
049 * if (bomIn.hasBOM()) {
050 *     // has a UTF-8 BOM
051 * }
052 * </pre>
053 * 
054 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
055 * 
056 * <pre>
057 * boolean include = true;
058 * BOMInputStream bomIn = new BOMInputStream(in, include);
059 * if (bomIn.hasBOM()) {
060 *     // has a UTF-8 BOM
061 * }
062 * </pre>
063 * 
064 * <h3>Example 3 - Detect Multiple BOMs</h3>
065 * 
066 * <pre>
067 * BOMInputStream bomIn = new BOMInputStream(in, 
068 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
069 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
070 *   );
071 * if (bomIn.hasBOM() == false) {
072 *     // No BOM found
073 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
074 *     // has a UTF-16LE BOM
075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
076 *     // has a UTF-16BE BOM
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
078 *     // has a UTF-32LE BOM
079 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
080 *     // has a UTF-32BE BOM
081 * }
082 * </pre>
083 * 
084 * @see org.apache.commons.io.ByteOrderMark
085 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
086 * @version $Id: BOMInputStream.java 1686527 2015-06-20 06:31:39Z krosenvold $
087 * @since 2.0
088 */
089public class BOMInputStream extends ProxyInputStream {
090    private final boolean include;
091    /**
092     * BOMs are sorted from longest to shortest.
093     */
094    private final List<ByteOrderMark> boms;
095    private ByteOrderMark byteOrderMark;
096    private int[] firstBytes;
097    private int fbLength;
098    private int fbIndex;
099    private int markFbIndex;
100    private boolean markedAtStart;
101
102    /**
103     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
104     * 
105     * @param delegate
106     *            the InputStream to delegate to
107     */
108    public BOMInputStream(final InputStream delegate) {
109        this(delegate, false, ByteOrderMark.UTF_8);
110    }
111
112    /**
113     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
114     * 
115     * @param delegate
116     *            the InputStream to delegate to
117     * @param include
118     *            true to include the UTF-8 BOM or false to exclude it
119     */
120    public BOMInputStream(final InputStream delegate, final boolean include) {
121        this(delegate, include, ByteOrderMark.UTF_8);
122    }
123
124    /**
125     * Constructs a new BOM InputStream that excludes the specified BOMs.
126     * 
127     * @param delegate
128     *            the InputStream to delegate to
129     * @param boms
130     *            The BOMs to detect and exclude
131     */
132    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
133        this(delegate, false, boms);
134    }
135
136    /**
137     * Compares ByteOrderMark objects in descending length order.
138     */
139    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
140
141        public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
142            final int len1 = bom1.length();
143            final int len2 = bom2.length();
144            if (len1 > len2) {
145                return EOF;
146            }
147            if (len2 > len1) {
148                return 1;
149            }
150            return 0;
151        }
152    };
153
154    /**
155     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
156     * 
157     * @param delegate
158     *            the InputStream to delegate to
159     * @param include
160     *            true to include the specified BOMs or false to exclude them
161     * @param boms
162     *            The BOMs to detect and optionally exclude
163     */
164    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
165        super(delegate);
166        if (boms == null || boms.length == 0) {
167            throw new IllegalArgumentException("No BOMs specified");
168        }
169        this.include = include;
170        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
171        Arrays.sort(boms, ByteOrderMarkLengthComparator);
172        this.boms = Arrays.asList(boms);
173
174    }
175
176    /**
177     * Indicates whether the stream contains one of the specified BOMs.
178     * 
179     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
180     * @throws IOException
181     *             if an error reading the first bytes of the stream occurs
182     */
183    public boolean hasBOM() throws IOException {
184        return getBOM() != null;
185    }
186
187    /**
188     * Indicates whether the stream contains the specified BOM.
189     * 
190     * @param bom
191     *            The BOM to check for
192     * @return true if the stream has the specified BOM, otherwise false if it does not
193     * @throws IllegalArgumentException
194     *             if the BOM is not one the stream is configured to detect
195     * @throws IOException
196     *             if an error reading the first bytes of the stream occurs
197     */
198    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
199        if (!boms.contains(bom)) {
200            throw new IllegalArgumentException("Stream not configure to detect " + bom);
201        }
202        return byteOrderMark != null && getBOM().equals(bom);
203    }
204
205    /**
206     * Return the BOM (Byte Order Mark).
207     * 
208     * @return The BOM or null if none
209     * @throws IOException
210     *             if an error reading the first bytes of the stream occurs
211     */
212    public ByteOrderMark getBOM() throws IOException {
213        if (firstBytes == null) {
214            fbLength = 0;
215            // BOMs are sorted from longest to shortest
216            final int maxBomSize = boms.get(0).length();
217            firstBytes = new int[maxBomSize];
218            // Read first maxBomSize bytes
219            for (int i = 0; i < firstBytes.length; i++) {
220                firstBytes[i] = in.read();
221                fbLength++;
222                if (firstBytes[i] < 0) {
223                    break;
224                }
225            }
226            // match BOM in firstBytes
227            byteOrderMark = find();
228            if (byteOrderMark != null) {
229                if (!include) {
230                    if (byteOrderMark.length() < firstBytes.length) {
231                        fbIndex = byteOrderMark.length();
232                    } else {
233                        fbLength = 0;
234                    }
235                }
236            }
237        }
238        return byteOrderMark;
239    }
240
241    /**
242     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
243     * 
244     * @return The BOM charset Name or null if no BOM found
245     * @throws IOException
246     *             if an error reading the first bytes of the stream occurs
247     * 
248     */
249    public String getBOMCharsetName() throws IOException {
250        getBOM();
251        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
252    }
253
254    /**
255     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
256     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
257     * processed already.
258     * 
259     * @return the byte read (excluding BOM) or -1 if the end of stream
260     * @throws IOException
261     *             if an I/O error occurs
262     */
263    private int readFirstBytes() throws IOException {
264        getBOM();
265        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
266    }
267
268    /**
269     * Find a BOM with the specified bytes.
270     * 
271     * @return The matched BOM or null if none matched
272     */
273    private ByteOrderMark find() {
274        for (final ByteOrderMark bom : boms) {
275            if (matches(bom)) {
276                return bom;
277            }
278        }
279        return null;
280    }
281
282    /**
283     * Check if the bytes match a BOM.
284     * 
285     * @param bom
286     *            The BOM
287     * @return true if the bytes match the bom, otherwise false
288     */
289    private boolean matches(final ByteOrderMark bom) {
290        // if (bom.length() != fbLength) {
291        // return false;
292        // }
293        // firstBytes may be bigger than the BOM bytes
294        for (int i = 0; i < bom.length(); i++) {
295            if (bom.get(i) != firstBytes[i]) {
296                return false;
297            }
298        }
299        return true;
300    }
301
302    // ----------------------------------------------------------------------------
303    // Implementation of InputStream
304    // ----------------------------------------------------------------------------
305
306    /**
307     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
308     * 
309     * @return the byte read (excluding BOM) or -1 if the end of stream
310     * @throws IOException
311     *             if an I/O error occurs
312     */
313    @Override
314    public int read() throws IOException {
315        final int b = readFirstBytes();
316        return b >= 0 ? b : in.read();
317    }
318
319    /**
320     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
321     * 
322     * @param buf
323     *            the buffer to read the bytes into
324     * @param off
325     *            The start offset
326     * @param len
327     *            The number of bytes to read (excluding BOM)
328     * @return the number of bytes read or -1 if the end of stream
329     * @throws IOException
330     *             if an I/O error occurs
331     */
332    @Override
333    public int read(final byte[] buf, int off, int len) throws IOException {
334        int firstCount = 0;
335        int b = 0;
336        while (len > 0 && b >= 0) {
337            b = readFirstBytes();
338            if (b >= 0) {
339                buf[off++] = (byte) (b & 0xFF);
340                len--;
341                firstCount++;
342            }
343        }
344        final int secondCount = in.read(buf, off, len);
345        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
346    }
347
348    /**
349     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
350     * 
351     * @param buf
352     *            the buffer to read the bytes into
353     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
354     * @throws IOException
355     *             if an I/O error occurs
356     */
357    @Override
358    public int read(final byte[] buf) throws IOException {
359        return read(buf, 0, buf.length);
360    }
361
362    /**
363     * Invokes the delegate's <code>mark(int)</code> method.
364     * 
365     * @param readlimit
366     *            read ahead limit
367     */
368    @Override
369    public synchronized void mark(final int readlimit) {
370        markFbIndex = fbIndex;
371        markedAtStart = firstBytes == null;
372        in.mark(readlimit);
373    }
374
375    /**
376     * Invokes the delegate's <code>reset()</code> method.
377     * 
378     * @throws IOException
379     *             if an I/O error occurs
380     */
381    @Override
382    public synchronized void reset() throws IOException {
383        fbIndex = markFbIndex;
384        if (markedAtStart) {
385            firstBytes = null;
386        }
387
388        in.reset();
389    }
390
391    /**
392     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM.
393     * 
394     * @param n
395     *            the number of bytes to skip
396     * @return the number of bytes to skipped or -1 if the end of stream
397     * @throws IOException
398     *             if an I/O error occurs
399     */
400    @Override
401    public long skip(long n) throws IOException {
402        int skipped = 0;
403        while ((n > skipped) && (readFirstBytes() >= 0)) {
404            skipped++;
405        }
406        return in.skip(n - skipped) + skipped;
407    }
408}