001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.io.input;
018    
019    import java.io.IOException;
020    import java.io.InputStream;
021    import java.util.Arrays;
022    import java.util.Comparator;
023    import java.util.List;
024    
025    import org.apache.commons.io.ByteOrderMark;
026    
027    /**
028     * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
029     * 
030     * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
031     * first byte in the stream.
032     * 
033     * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034     * <ul>
035     * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036     * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037     * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038     * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
039     * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
040     * </ul>
041     * 
042     * 
043     * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
044     * 
045     * <pre>
046     * BOMInputStream bomIn = new BOMInputStream(in);
047     * if (bomIn.hasBOM()) {
048     *     // has a UTF-8 BOM
049     * }
050     * </pre>
051     * 
052     * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
053     * 
054     * <pre>
055     * boolean include = true;
056     * BOMInputStream bomIn = new BOMInputStream(in, include);
057     * if (bomIn.hasBOM()) {
058     *     // has a UTF-8 BOM
059     * }
060     * </pre>
061     * 
062     * <h3>Example 3 - Detect Multiple BOMs</h3>
063     * 
064     * <pre>
065     * BOMInputStream bomIn = new BOMInputStream(in, 
066     *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
067     *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
068     *   );
069     * if (bomIn.hasBOM() == false) {
070     *     // No BOM found
071     * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
072     *     // has a UTF-16LE BOM
073     * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
074     *     // has a UTF-16BE BOM
075     * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
076     *     // has a UTF-32LE BOM
077     * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
078     *     // has a UTF-32BE BOM
079     * }
080     * </pre>
081     * 
082     * @see org.apache.commons.io.ByteOrderMark
083     * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
084     * @version $Id: BOMInputStream.java 1346400 2012-06-05 14:48:01Z ggregory $
085     * @since 2.0
086     */
087    public class BOMInputStream extends ProxyInputStream {
088        private final boolean include;
089        /**
090         * BOMs are sorted from longest to shortest.
091         */
092        private final List<ByteOrderMark> boms;
093        private ByteOrderMark byteOrderMark;
094        private int[] firstBytes;
095        private int fbLength;
096        private int fbIndex;
097        private int markFbIndex;
098        private boolean markedAtStart;
099    
100        /**
101         * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
102         * 
103         * @param delegate
104         *            the InputStream to delegate to
105         */
106        public BOMInputStream(InputStream delegate) {
107            this(delegate, false, ByteOrderMark.UTF_8);
108        }
109    
110        /**
111         * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
112         * 
113         * @param delegate
114         *            the InputStream to delegate to
115         * @param include
116         *            true to include the UTF-8 BOM or false to exclude it
117         */
118        public BOMInputStream(InputStream delegate, boolean include) {
119            this(delegate, include, ByteOrderMark.UTF_8);
120        }
121    
122        /**
123         * Constructs a new BOM InputStream that excludes the specified BOMs.
124         * 
125         * @param delegate
126         *            the InputStream to delegate to
127         * @param boms
128         *            The BOMs to detect and exclude
129         */
130        public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
131            this(delegate, false, boms);
132        }
133    
134        /**
135         * Compares ByteOrderMark objects in descending length order.
136         */
137        private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
138    
139            public int compare(ByteOrderMark bom1, ByteOrderMark bom2) {
140                int len1 = bom1.length();
141                int len2 = bom2.length();
142                if (len1 > len2) {
143                    return -1;
144                }
145                if (len2 > len1) {
146                    return 1;
147                }
148                return 0;
149            }
150        };
151    
152        /**
153         * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
154         * 
155         * @param delegate
156         *            the InputStream to delegate to
157         * @param include
158         *            true to include the specified BOMs or false to exclude them
159         * @param boms
160         *            The BOMs to detect and optionally exclude
161         */
162        public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
163            super(delegate);
164            if (boms == null || boms.length == 0) {
165                throw new IllegalArgumentException("No BOMs specified");
166            }
167            this.include = include;
168            // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
169            Arrays.sort(boms, ByteOrderMarkLengthComparator);
170            this.boms = Arrays.asList(boms);
171    
172        }
173    
174        /**
175         * Indicates whether the stream contains one of the specified BOMs.
176         * 
177         * @return true if the stream has one of the specified BOMs, otherwise false if it does not
178         * @throws IOException
179         *             if an error reading the first bytes of the stream occurs
180         */
181        public boolean hasBOM() throws IOException {
182            return getBOM() != null;
183        }
184    
185        /**
186         * Indicates whether the stream contains the specified BOM.
187         * 
188         * @param bom
189         *            The BOM to check for
190         * @return true if the stream has the specified BOM, otherwise false if it does not
191         * @throws IllegalArgumentException
192         *             if the BOM is not one the stream is configured to detect
193         * @throws IOException
194         *             if an error reading the first bytes of the stream occurs
195         */
196        public boolean hasBOM(ByteOrderMark bom) throws IOException {
197            if (!boms.contains(bom)) {
198                throw new IllegalArgumentException("Stream not configure to detect " + bom);
199            }
200            return byteOrderMark != null && getBOM().equals(bom);
201        }
202    
203        /**
204         * Return the BOM (Byte Order Mark).
205         * 
206         * @return The BOM or null if none
207         * @throws IOException
208         *             if an error reading the first bytes of the stream occurs
209         */
210        public ByteOrderMark getBOM() throws IOException {
211            if (firstBytes == null) {
212                fbLength = 0;
213                // BOMs are sorted from longest to shortest
214                final int maxBomSize = boms.get(0).length();
215                firstBytes = new int[maxBomSize];
216                // Read first maxBomSize bytes
217                for (int i = 0; i < firstBytes.length; i++) {
218                    firstBytes[i] = in.read();
219                    fbLength++;
220                    if (firstBytes[i] < 0) {
221                        break;
222                    }
223                }
224                // match BOM in firstBytes
225                byteOrderMark = find();
226                if (byteOrderMark != null) {
227                    if (!include) {
228                        if (byteOrderMark.length() < firstBytes.length) {
229                            fbIndex = byteOrderMark.length();
230                        } else {
231                            fbLength = 0;
232                        }
233                    }
234                }
235            }
236            return byteOrderMark;
237        }
238    
239        /**
240         * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
241         * 
242         * @return The BOM charset Name or null if no BOM found
243         * @throws IOException
244         *             if an error reading the first bytes of the stream occurs
245         * 
246         */
247        public String getBOMCharsetName() throws IOException {
248            getBOM();
249            return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
250        }
251    
252        /**
253         * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
254         * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
255         * processed already.
256         * 
257         * @return the byte read (excluding BOM) or -1 if the end of stream
258         * @throws IOException
259         *             if an I/O error occurs
260         */
261        private int readFirstBytes() throws IOException {
262            getBOM();
263            return fbIndex < fbLength ? firstBytes[fbIndex++] : -1;
264        }
265    
266        /**
267         * Find a BOM with the specified bytes.
268         * 
269         * @return The matched BOM or null if none matched
270         */
271        private ByteOrderMark find() {
272            for (ByteOrderMark bom : boms) {
273                if (matches(bom)) {
274                    return bom;
275                }
276            }
277            return null;
278        }
279    
280        /**
281         * Check if the bytes match a BOM.
282         * 
283         * @param bom
284         *            The BOM
285         * @return true if the bytes match the bom, otherwise false
286         */
287        private boolean matches(ByteOrderMark bom) {
288            // if (bom.length() != fbLength) {
289            // return false;
290            // }
291            // firstBytes may be bigger than the BOM bytes
292            for (int i = 0; i < bom.length(); i++) {
293                if (bom.get(i) != firstBytes[i]) {
294                    return false;
295                }
296            }
297            return true;
298        }
299    
300        // ----------------------------------------------------------------------------
301        // Implementation of InputStream
302        // ----------------------------------------------------------------------------
303    
304        /**
305         * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
306         * 
307         * @return the byte read (excluding BOM) or -1 if the end of stream
308         * @throws IOException
309         *             if an I/O error occurs
310         */
311        @Override
312        public int read() throws IOException {
313            int b = readFirstBytes();
314            return b >= 0 ? b : in.read();
315        }
316    
317        /**
318         * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
319         * 
320         * @param buf
321         *            the buffer to read the bytes into
322         * @param off
323         *            The start offset
324         * @param len
325         *            The number of bytes to read (excluding BOM)
326         * @return the number of bytes read or -1 if the end of stream
327         * @throws IOException
328         *             if an I/O error occurs
329         */
330        @Override
331        public int read(byte[] buf, int off, int len) throws IOException {
332            int firstCount = 0;
333            int b = 0;
334            while (len > 0 && b >= 0) {
335                b = readFirstBytes();
336                if (b >= 0) {
337                    buf[off++] = (byte) (b & 0xFF);
338                    len--;
339                    firstCount++;
340                }
341            }
342            int secondCount = in.read(buf, off, len);
343            return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount;
344        }
345    
346        /**
347         * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
348         * 
349         * @param buf
350         *            the buffer to read the bytes into
351         * @return the number of bytes read (excluding BOM) or -1 if the end of stream
352         * @throws IOException
353         *             if an I/O error occurs
354         */
355        @Override
356        public int read(byte[] buf) throws IOException {
357            return read(buf, 0, buf.length);
358        }
359    
360        /**
361         * Invokes the delegate's <code>mark(int)</code> method.
362         * 
363         * @param readlimit
364         *            read ahead limit
365         */
366        @Override
367        public synchronized void mark(int readlimit) {
368            markFbIndex = fbIndex;
369            markedAtStart = firstBytes == null;
370            in.mark(readlimit);
371        }
372    
373        /**
374         * Invokes the delegate's <code>reset()</code> method.
375         * 
376         * @throws IOException
377         *             if an I/O error occurs
378         */
379        @Override
380        public synchronized void reset() throws IOException {
381            fbIndex = markFbIndex;
382            if (markedAtStart) {
383                firstBytes = null;
384            }
385    
386            in.reset();
387        }
388    
389        /**
390         * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM.
391         * 
392         * @param n
393         *            the number of bytes to skip
394         * @return the number of bytes to skipped or -1 if the end of stream
395         * @throws IOException
396         *             if an I/O error occurs
397         */
398        @Override
399        public long skip(long n) throws IOException {
400            while (n > 0 && readFirstBytes() >= 0) {
401                n--;
402            }
403            return in.skip(n);
404        }
405    }