Coverage Report - org.apache.commons.io.input.BOMInputStream
 
Classes in this File Line Coverage Branch Coverage Complexity
BOMInputStream
100%
74/74
100%
56/56
2,833
BOMInputStream$1
100%
8/8
100%
4/4
2,833
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  * 
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  * 
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.io.input;
 18  
 
 19  
 import static org.apache.commons.io.IOUtils.EOF;
 20  
 
 21  
 import java.io.IOException;
 22  
 import java.io.InputStream;
 23  
 import java.util.Arrays;
 24  
 import java.util.Collections;
 25  
 import java.util.Comparator;
 26  
 import java.util.List;
 27  
 
 28  
 import org.apache.commons.io.ByteOrderMark;
 29  
 
 30  
 /**
 31  
  * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 32  
  * 
 33  
  * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 34  
  * first byte in the stream.
 35  
  * 
 36  
  * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
 37  
  * <ul>
 38  
  * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 39  
  * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 40  
  * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 41  
  * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 42  
  * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 43  
  * </ul>
 44  
  * 
 45  
  * 
 46  
  * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
 47  
  * 
 48  
  * <pre>
 49  
  * BOMInputStream bomIn = new BOMInputStream(in);
 50  
  * if (bomIn.hasBOM()) {
 51  
  *     // has a UTF-8 BOM
 52  
  * }
 53  
  * </pre>
 54  
  * 
 55  
  * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
 56  
  * 
 57  
  * <pre>
 58  
  * boolean include = true;
 59  
  * BOMInputStream bomIn = new BOMInputStream(in, include);
 60  
  * if (bomIn.hasBOM()) {
 61  
  *     // has a UTF-8 BOM
 62  
  * }
 63  
  * </pre>
 64  
  * 
 65  
  * <h3>Example 3 - Detect Multiple BOMs</h3>
 66  
  * 
 67  
  * <pre>
 68  
  * BOMInputStream bomIn = new BOMInputStream(in, 
 69  
  *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
 70  
  *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
 71  
  *   );
 72  
  * if (bomIn.hasBOM() == false) {
 73  
  *     // No BOM found
 74  
  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 75  
  *     // has a UTF-16LE BOM
 76  
  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 77  
  *     // has a UTF-16BE BOM
 78  
  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 79  
  *     // has a UTF-32LE BOM
 80  
  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 81  
  *     // has a UTF-32BE BOM
 82  
  * }
 83  
  * </pre>
 84  
  * 
 85  
  * @see org.apache.commons.io.ByteOrderMark
 86  
  * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 87  
  * @version $Id$
 88  
  * @since 2.0
 89  
  */
 90  
 public class BOMInputStream extends ProxyInputStream {
 91  
     private final boolean include;
 92  
     /**
 93  
      * BOMs are sorted from longest to shortest.
 94  
      */
 95  
     private final List<ByteOrderMark> boms;
 96  
     private ByteOrderMark byteOrderMark;
 97  
     private int[] firstBytes;
 98  
     private int fbLength;
 99  
     private int fbIndex;
 100  
     private int markFbIndex;
 101  
     private boolean markedAtStart;
 102  
 
 103  
     /**
 104  
      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
 105  
      * 
 106  
      * @param delegate
 107  
      *            the InputStream to delegate to
 108  
      */
 109  
     public BOMInputStream(final InputStream delegate) {
 110  56
         this(delegate, false, ByteOrderMark.UTF_8);
 111  56
     }
 112  
 
 113  
     /**
 114  
      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} BOM and optionally includes it.
 115  
      * 
 116  
      * @param delegate
 117  
      *            the InputStream to delegate to
 118  
      * @param include
 119  
      *            true to include the UTF-8 BOM or false to exclude it
 120  
      */
 121  
     public BOMInputStream(final InputStream delegate, final boolean include) {
 122  4
         this(delegate, include, ByteOrderMark.UTF_8);
 123  4
     }
 124  
 
 125  
     /**
 126  
      * Constructs a new BOM InputStream that excludes the specified BOMs.
 127  
      * 
 128  
      * @param delegate
 129  
      *            the InputStream to delegate to
 130  
      * @param boms
 131  
      *            The BOMs to detect and exclude
 132  
      */
 133  
     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
 134  22
         this(delegate, false, boms);
 135  22
     }
 136  
 
 137  
     /**
 138  
      * Compares ByteOrderMark objects in descending length order.
 139  
      */
 140  4580
     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
 141  
 
 142  
         @Override
 143  
         public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
 144  4574
             final int len1 = bom1.length();
 145  4574
             final int len2 = bom2.length();
 146  4574
             if (len1 > len2) {
 147  62
                 return EOF;
 148  
             }
 149  4512
             if (len2 > len1) {
 150  1494
                 return 1;
 151  
             }
 152  3018
             return 0;
 153  
         }
 154  
     };
 155  
 
 156  
     /**
 157  
      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 158  
      * 
 159  
      * @param delegate
 160  
      *            the InputStream to delegate to
 161  
      * @param include
 162  
      *            true to include the specified BOMs or false to exclude them
 163  
      * @param boms
 164  
      *            The BOMs to detect and optionally exclude
 165  
      */
 166  
     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
 167  1090
         super(delegate);
 168  1090
         if (boms == null || boms.length == 0) {
 169  4
             throw new IllegalArgumentException("No BOMs specified");
 170  
         }
 171  1086
         this.include = include;
 172  1086
         List<ByteOrderMark> list = Arrays.asList(boms);
 173  
         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
 174  1086
         Collections.sort(list, ByteOrderMarkLengthComparator);
 175  1086
         this.boms = list;
 176  
 
 177  1086
     }
 178  
 
 179  
     /**
 180  
      * Indicates whether the stream contains one of the specified BOMs.
 181  
      * 
 182  
      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
 183  
      * @throws IOException
 184  
      *             if an error reading the first bytes of the stream occurs
 185  
      */
 186  
     public boolean hasBOM() throws IOException {
 187  24
         return getBOM() != null;
 188  
     }
 189  
 
 190  
     /**
 191  
      * Indicates whether the stream contains the specified BOM.
 192  
      * 
 193  
      * @param bom
 194  
      *            The BOM to check for
 195  
      * @return true if the stream has the specified BOM, otherwise false if it does not
 196  
      * @throws IllegalArgumentException
 197  
      *             if the BOM is not one the stream is configured to detect
 198  
      * @throws IOException
 199  
      *             if an error reading the first bytes of the stream occurs
 200  
      */
 201  
     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
 202  36
         if (!boms.contains(bom)) {
 203  10
             throw new IllegalArgumentException("Stream not configure to detect " + bom);
 204  
         }
 205  26
         return byteOrderMark != null && getBOM().equals(bom);
 206  
     }
 207  
 
 208  
     /**
 209  
      * Return the BOM (Byte Order Mark).
 210  
      * 
 211  
      * @return The BOM or null if none
 212  
      * @throws IOException
 213  
      *             if an error reading the first bytes of the stream occurs
 214  
      */
 215  
     public ByteOrderMark getBOM() throws IOException {
 216  12448
         if (firstBytes == null) {
 217  1088
             fbLength = 0;
 218  
             // BOMs are sorted from longest to shortest
 219  1088
             final int maxBomSize = boms.get(0).length();
 220  1088
             firstBytes = new int[maxBomSize];
 221  
             // Read first maxBomSize bytes
 222  6924
             for (int i = 0; i < firstBytes.length; i++) {
 223  6286
                 firstBytes[i] = in.read();
 224  6286
                 fbLength++;
 225  6286
                 if (firstBytes[i] < 0) {
 226  450
                     break;
 227  
                 }
 228  
             }
 229  
             // match BOM in firstBytes
 230  1088
             byteOrderMark = find();
 231  1088
             if (byteOrderMark != null) {
 232  428
                 if (!include) {
 233  192
                     if (byteOrderMark.length() < firstBytes.length) {
 234  96
                         fbIndex = byteOrderMark.length();
 235  
                     } else {
 236  96
                         fbLength = 0;
 237  
                     }
 238  
                 }
 239  
             }
 240  
         }
 241  12448
         return byteOrderMark;
 242  
     }
 243  
 
 244  
     /**
 245  
      * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
 246  
      * 
 247  
      * @return The BOM charset Name or null if no BOM found
 248  
      * @throws IOException
 249  
      *             if an error reading the first bytes of the stream occurs
 250  
      * 
 251  
      */
 252  
     public String getBOMCharsetName() throws IOException {
 253  1004
         getBOM();
 254  1004
         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
 255  
     }
 256  
 
 257  
     /**
 258  
      * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
 259  
      * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
 260  
      * processed already.
 261  
      * 
 262  
      * @return the next buffered first byte (excluding BOM) or -1 if the buffered first bytes have all been consumed
 263  
      * @throws IOException
 264  
      *             if an I/O error occurs
 265  
      */
 266  
     private int readFirstBytes() throws IOException {
 267  11376
         getBOM();
 268  11376
         return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
 269  
     }
 270  
 
 271  
     /**
 272  
      * Find a BOM with the specified bytes.
 273  
      * 
 274  
      * @return The matched BOM or null if none matched
 275  
      */
 276  
     private ByteOrderMark find() {
 277  1088
         for (final ByteOrderMark bom : boms) {
 278  4580
             if (matches(bom)) {
 279  428
                 return bom;
 280  
             }
 281  4152
         }
 282  660
         return null;
 283  
     }
 284  
 
 285  
     /**
 286  
      * Check if the bytes match a BOM.
 287  
      * 
 288  
      * @param bom
 289  
      *            The BOM
 290  
      * @return true if the bytes match the bom, otherwise false
 291  
      */
 292  
     private boolean matches(final ByteOrderMark bom) {
 293  
         // if (bom.length() != fbLength) {
 294  
         // return false;
 295  
         // }
 296  
         // firstBytes may be bigger than the BOM bytes
 297  7452
         for (int i = 0; i < bom.length(); i++) {
 298  7024
             if (bom.get(i) != firstBytes[i]) {
 299  4152
                 return false;
 300  
             }
 301  
         }
 302  428
         return true;
 303  
     }
 304  
 
 305  
     // ----------------------------------------------------------------------------
 306  
     // Implementation of InputStream
 307  
     // ----------------------------------------------------------------------------
 308  
 
 309  
     /**
 310  
      * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
 311  
      * 
 312  
      * @return the byte read (excluding BOM) or -1 if the end of stream
 313  
      * @throws IOException
 314  
      *             if an I/O error occurs
 315  
      */
 316  
     @Override
 317  
     public int read() throws IOException {
 318  6354
         final int b = readFirstBytes();
 319  6354
         return b >= 0 ? b : in.read();
 320  
     }
 321  
 
 322  
     /**
 323  
      * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
 324  
      * 
 325  
      * @param buf
 326  
      *            the buffer to read the bytes into
 327  
      * @param off
 328  
      *            The start offset
 329  
      * @param len
 330  
      *            The number of bytes to read (excluding BOM)
 331  
      * @return the number of bytes read or -1 if the end of stream
 332  
      * @throws IOException
 333  
      *             if an I/O error occurs
 334  
      */
 335  
     @Override
 336  
     public int read(final byte[] buf, int off, int len) throws IOException {
 337  1586
         int firstCount = 0;
 338  1586
         int b = 0;
 339  6596
         while (len > 0 && b >= 0) {
 340  5010
             b = readFirstBytes();
 341  5010
             if (b >= 0) {
 342  3870
                 buf[off++] = (byte) (b & 0xFF);
 343  3870
                 len--;
 344  3870
                 firstCount++;
 345  
             }
 346  
         }
 347  1586
         final int secondCount = in.read(buf, off, len);
 348  1586
         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
 349  
     }
 350  
 
 351  
     /**
 352  
      * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
 353  
      * 
 354  
      * @param buf
 355  
      *            the buffer to read the bytes into
 356  
      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
 357  
      * @throws IOException
 358  
      *             if an I/O error occurs
 359  
      */
 360  
     @Override
 361  
     public int read(final byte[] buf) throws IOException {
 362  570
         return read(buf, 0, buf.length);
 363  
     }
 364  
 
 365  
     /**
 366  
      * Invokes the delegate's <code>mark(int)</code> method.
 367  
      * 
 368  
      * @param readlimit
 369  
      *            read ahead limit
 370  
      */
 371  
     @Override
 372  
     public synchronized void mark(final int readlimit) {
 373  476
         markFbIndex = fbIndex;
 374  476
         markedAtStart = firstBytes == null;
 375  476
         in.mark(readlimit);
 376  476
     }
 377  
 
 378  
     /**
 379  
      * Invokes the delegate's <code>reset()</code> method.
 380  
      * 
 381  
      * @throws IOException
 382  
      *             if an I/O error occurs
 383  
      */
 384  
     @Override
 385  
     public synchronized void reset() throws IOException {
 386  476
         fbIndex = markFbIndex;
 387  476
         if (markedAtStart) {
 388  8
             firstBytes = null;
 389  
         }
 390  
 
 391  476
         in.reset();
 392  476
     }
 393  
 
 394  
     /**
 395  
      * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM.
 396  
      * 
 397  
      * @param n
 398  
      *            the number of bytes to skip
 399  
      * @return the number of bytes skipped or -1 if the end of stream
 400  
      * @throws IOException
 401  
      *             if an I/O error occurs
 402  
      */
 403  
     @Override
 404  
     public long skip(long n) throws IOException {
 405  8
         int skipped = 0;
 406  16
         while ((n > skipped) && (readFirstBytes() >= 0)) {
 407  8
             skipped++;
 408  
         }
 409  8
         return in.skip(n - skipped) + skipped;
 410  
     }
 411  
 }