/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.io.ByteOrderMark;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 *
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 *
 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 *
 *
 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = new BOMInputStream(in, include);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h3>Example 3 - Detect Multiple BOMs</h3>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in,
 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
 *   );
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @version $Id: BOMInputStream.java 1686527 2015-06-20 06:31:39Z krosenvold $
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    /** True to hand a detected BOM back to the reader, false to skip over it. */
    private final boolean include;

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The BOM found by {@link #getBOM()}, or null if none matched or detection has not run yet. */
    private ByteOrderMark byteOrderMark;

    /** Leading bytes read from the delegate during detection; null until detection has run. */
    private int[] firstBytes;

    /** Number of entries in {@link #firstBytes} that may still be handed out to the reader. */
    private int fbLength;

    /** Index in {@link #firstBytes} of the next buffered byte to hand out. */
    private int fbIndex;

    /** Value of {@link #fbIndex} captured by the last call to {@link #mark(int)}. */
    private int markFbIndex;

    /** True if the last call to {@link #mark(int)} happened before BOM detection had run. */
    private boolean markedAtStart;

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     */
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     */
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     */
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {

        public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
            final int len1 = bom1.length();
            final int len2 = bom2.length();
            if (len1 > len2) {
                // longer BOMs sort first
                return -1;
            }
            if (len2 > len1) {
                return 1;
            }
            return 0;
        }
    };

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if {@code boms} is null or empty
     */
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (boms == null || boms.length == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Work on a private copy so the caller's array is neither reordered nor able to change our list later.
        final ByteOrderMark[] sorted = boms.clone();
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        Arrays.sort(sorted, ByteOrderMarkLengthComparator);
        this.boms = Arrays.asList(sorted);
    }

    /**
     * Indicates whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Indicates whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configure to detect " + bom);
        }
        // Run detection first: the field is still null on a stream that has not been read from yet.
        final ByteOrderMark detected = getBOM();
        return detected != null && detected.equals(bom);
    }

    /**
     * Return the BOM (Byte Order Mark).
     *
     * The first call reads up to as many bytes from the delegate as the longest configured BOM and buffers them;
     * later calls return the cached result.
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes, stopping early at end of stream
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                if (firstBytes[i] < 0) {
                    break;
                }
                // only real bytes are counted, never the end-of-stream marker
                fbLength++;
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // bytes buffered beyond the BOM still belong to the reader
                    fbIndex = byteOrderMark.length();
                } else {
                    // the buffer holds nothing but the BOM, so discard all of it
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     *
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the buffered first bytes are exhausted
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Find a BOM with the specified bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        for (final ByteOrderMark bom : boms) {
            if (matches(bom)) {
                return bom;
            }
        }
        return null;
    }

    /**
     * Check if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // a BOM longer than the bytes actually read cannot match
        if (bom.length() > fbLength) {
            return false;
        }
        // firstBytes may be bigger than the BOM bytes
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    // ----------------------------------------------------------------------------
    // Implementation of InputStream
    // ----------------------------------------------------------------------------

    /**
     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // First drain whatever detection buffered, then let the delegate fill the rest.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        final int secondCount = in.read(buf, off, len);
        if (secondCount < 0) {
            // delegate is exhausted: report the buffered bytes if there were any, otherwise end of stream
            return firstCount > 0 ? firstCount : EOF;
        }
        return firstCount + secondCount;
    }

    /**
     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's <code>mark(int)</code> method.
     *
     * Also records the current position within the buffered first bytes, and whether BOM detection has run yet,
     * so that {@link #reset()} can restore both.
     *
     * @param readlimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readlimit) {
        markFbIndex = fbIndex;
        markedAtStart = firstBytes == null;
        in.mark(readlimit);
    }

    /**
     * Invokes the delegate's <code>reset()</code> method.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // the mark predates detection, so the first bytes must be read again
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of bytes skipped
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        // Buffered first bytes count towards the total before the delegate is asked to skip.
        int skipped = 0;
        while ((n > skipped) && (readFirstBytes() >= 0)) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}