001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.io.input;
018    
019    import java.io.IOException;
020    import java.io.InputStream;
021    import java.util.Arrays;
022    import java.util.List;
023    
024    import org.apache.commons.io.ByteOrderMark;
025    
026    /**
027     * This class is used to wrap a stream that includes an encoded
028     * {@link ByteOrderMark} as its first bytes.
029     *
030     * This class detects these bytes and, if required, can automatically skip them
031     * and return the subsequent byte as the first byte in the stream.
032     *
033     * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034     * <ul>
035     *   <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036     *   <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037     *   <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038     * </ul>
039     *
040     *
041     * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
042     * <pre>
043     *      BOMInputStream bomIn = new BOMInputStream(in);
044     *      if (bomIn.hasBOM()) {
045     *          // has a UTF-8 BOM
046     *      }
047     * </pre>
048     *
049     * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
050     * <pre>
051     *      boolean include = true;
052     *      BOMInputStream bomIn = new BOMInputStream(in, include);
053     *      if (bomIn.hasBOM()) {
054     *          // has a UTF-8 BOM
055     *      }
056     * </pre>
057     *
058     * <h3>Example 3 - Detect Multiple BOMs</h3>
059     * <pre>
060     *      BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
061     *      if (bomIn.hasBOM() == false) {
062     *          // No BOM found
063     *      } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
064     *          // has a UTF-16LE BOM
065     *      } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
066     *          // has a UTF-16BE BOM
067     *      }
068     * </pre>
069     *
070     * @see org.apache.commons.io.ByteOrderMark
071     * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
072     * @version $Revision: 1052095 $ $Date: 2010-12-22 18:03:20 -0500 (Wed, 22 Dec 2010) $
073     * @since Commons IO 2.0
074     */
075    public class BOMInputStream extends ProxyInputStream {
076        private final boolean include;
077        private final List<ByteOrderMark> boms;
078        private ByteOrderMark byteOrderMark;
079        private int[] firstBytes;
080        private int fbLength;
081        private int fbIndex;
082        private int markFbIndex;
083        private boolean markedAtStart;
084    
085        /**
086         * Constructs a new BOM InputStream that excludes
087         * a {@link ByteOrderMark#UTF_8} BOM.
088         * @param delegate the InputStream to delegate to
089         */
090        public BOMInputStream(InputStream delegate) {
091            this(delegate, false, ByteOrderMark.UTF_8);
092        }
093    
094        /**
095         * Constructs a new BOM InputStream that detects a
096         * a {@link ByteOrderMark#UTF_8} and optionally includes it.
097         * @param delegate the InputStream to delegate to
098         * @param include true to include the UTF-8 BOM or
099         * false to exclude it
100         */
101        public BOMInputStream(InputStream delegate, boolean include) {
102            this(delegate, include, ByteOrderMark.UTF_8);
103        }
104    
105        /**
106         * Constructs a new BOM InputStream that excludes
107         * the specified BOMs.
108         * @param delegate the InputStream to delegate to
109         * @param boms The BOMs to detect and exclude
110         */
111        public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
112            this(delegate, false, boms);
113        }
114    
115        /**
116         * Constructs a new BOM InputStream that detects the
117         * specified BOMs and optionally includes them.
118         * @param delegate the InputStream to delegate to
119         * @param include true to include the specified BOMs or
120         * false to exclude them
121         * @param boms The BOMs to detect and optionally exclude
122         */
123        public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
124            super(delegate);
125            if (boms == null || boms.length == 0) {
126                throw new IllegalArgumentException("No BOMs specified");
127            }
128            this.include = include;
129            this.boms = Arrays.asList(boms);
130        }
131    
132        /**
133         * Indicates whether the stream contains one of the specified BOMs.
134         *
135         * @return true if the stream has one of the specified BOMs, otherwise false
136         * if it does not
137         * @throws IOException if an error reading the first bytes of the stream occurs
138         */
139        public boolean hasBOM() throws IOException {
140            return (getBOM() != null);
141        }
142    
143        /**
144         * Indicates whether the stream contains the specified BOM.
145         *
146         * @param bom The BOM to check for
147         * @return true if the stream has the specified BOM, otherwise false
148         * if it does not
149         * @throws IllegalArgumentException if the BOM is not one the stream
150         * is configured to detect
151         * @throws IOException if an error reading the first bytes of the stream occurs
152         */
153        public boolean hasBOM(ByteOrderMark bom) throws IOException {
154            if (!boms.contains(bom)) {
155                throw new IllegalArgumentException("Stream not configure to detect " + bom);
156            }
157            return (byteOrderMark != null && getBOM().equals(bom));
158        }
159    
160        /**
161         * Return the BOM (Byte Order Mark).
162         *
163         * @return The BOM or null if none
164         * @throws IOException if an error reading the first bytes of the stream occurs
165         */
166        public ByteOrderMark getBOM() throws IOException {
167            if (firstBytes == null) {
168                int max = 0;
169                for (ByteOrderMark bom : boms) {
170                    max = Math.max(max, bom.length());
171                }
172                firstBytes = new int[max];
173                for (int i = 0; i < firstBytes.length; i++) {
174                    firstBytes[i] = in.read();
175                    fbLength++;
176                    if (firstBytes[i] < 0) {
177                        break;
178                    }
179    
180                    byteOrderMark = find();
181                    if (byteOrderMark != null) {
182                        if (!include) {
183                            fbLength = 0;
184                        }
185                        break;
186                    }
187                }
188            }
189            return byteOrderMark;
190        }
191    
192        /**
193         * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
194         *
195         * @return The BOM charset Name or null if no BOM found
196         * @throws IOException if an error reading the first bytes of the stream occurs
197         * 
198         */
199        public String getBOMCharsetName() throws IOException {
200            getBOM();
201            return (byteOrderMark == null ? null : byteOrderMark.getCharsetName());
202        }
203    
204        /**
205         * This method reads and either preserves or skips the first bytes in the
206         * stream. It behaves like the single-byte <code>read()</code> method,
207         * either returning a valid byte or -1 to indicate that the initial bytes
208         * have been processed already.
209         * @return the byte read (excluding BOM) or -1 if the end of stream
210         * @throws IOException if an I/O error occurs
211         */
212        private int readFirstBytes() throws IOException {
213            getBOM();
214            return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1;
215        }
216    
217        /**
218         * Find a BOM with the specified bytes.
219         *
220         * @return The matched BOM or null if none matched
221         */
222        private ByteOrderMark find() {
223            for (ByteOrderMark bom : boms) {
224                if (matches(bom)) {
225                    return bom;
226                }
227            }
228            return null;
229        }
230    
231        /**
232         * Check if the bytes match a BOM.
233         *
234         * @param bom The BOM
235         * @return true if the bytes match the bom, otherwise false
236         */
237        private boolean matches(ByteOrderMark bom) {
238            if (bom.length() != fbLength) {
239                return false;
240            }
241            for (int i = 0; i < bom.length(); i++) {
242                if (bom.get(i) != firstBytes[i]) {
243                    return false;
244                }
245            }
246            return true;
247        }
248    
249        //----------------------------------------------------------------------------
250        //  Implementation of InputStream
251        //----------------------------------------------------------------------------
252    
253        /**
254         * Invokes the delegate's <code>read()</code> method, detecting and
255         * optionally skipping BOM.
256         * @return the byte read (excluding BOM) or -1 if the end of stream
257         * @throws IOException if an I/O error occurs
258         */
259        @Override
260        public int read() throws IOException {
261            int b = readFirstBytes();
262            return (b >= 0) ? b : in.read();
263        }
264    
265        /**
266         * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting
267         * and optionally skipping BOM.
268         * @param buf the buffer to read the bytes into
269         * @param off The start offset
270         * @param len The number of bytes to read (excluding BOM)
271         * @return the number of bytes read or -1 if the end of stream
272         * @throws IOException if an I/O error occurs
273         */
274        @Override
275        public int read(byte[] buf, int off, int len) throws IOException {
276            int firstCount = 0;
277            int b = 0;
278            while ((len > 0) && (b >= 0)) {
279                b = readFirstBytes();
280                if (b >= 0) {
281                    buf[off++] = (byte) (b & 0xFF);
282                    len--;
283                    firstCount++;
284                }
285            }
286            int secondCount = in.read(buf, off, len);
287            return (secondCount < 0) ? (firstCount > 0 ? firstCount : -1) : firstCount + secondCount;
288        }
289    
290        /**
291         * Invokes the delegate's <code>read(byte[])</code> method, detecting and
292         * optionally skipping BOM.
293         * @param buf the buffer to read the bytes into
294         * @return the number of bytes read (excluding BOM)
295         * or -1 if the end of stream
296         * @throws IOException if an I/O error occurs
297         */
298        @Override
299        public int read(byte[] buf) throws IOException {
300            return read(buf, 0, buf.length);
301        }
302    
303        /**
304         * Invokes the delegate's <code>mark(int)</code> method.
305         * @param readlimit read ahead limit
306         */
307        @Override
308        public synchronized void mark(int readlimit) {
309            markFbIndex = fbIndex;
310            markedAtStart = (firstBytes == null);
311            in.mark(readlimit);
312        }
313    
314        /**
315         * Invokes the delegate's <code>reset()</code> method.
316         * @throws IOException if an I/O error occurs
317         */
318        @Override
319        public synchronized void reset() throws IOException {
320            fbIndex = markFbIndex;
321            if (markedAtStart) {
322                firstBytes = null;
323            }
324    
325            in.reset();
326        }
327    
328        /**
329         * Invokes the delegate's <code>skip(long)</code> method, detecting
330         * and optionallyskipping BOM.
331         * @param n the number of bytes to skip
332         * @return the number of bytes to skipped or -1 if the end of stream
333         * @throws IOException if an I/O error occurs
334         */
335        @Override
336        public long skip(long n) throws IOException {
337            while ((n > 0) && (readFirstBytes() >= 0)) {
338                n--;
339            }
340            return in.skip(n);
341        }
342    }