001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.io.input; 018 019 import java.io.IOException; 020 import java.io.InputStream; 021 import java.util.Arrays; 022 import java.util.Comparator; 023 import java.util.List; 024 025 import org.apache.commons.io.ByteOrderMark; 026 027 /** 028 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 029 * 030 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the 031 * first byte in the stream. 032 * 033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 034 * <ul> 035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 038 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 039 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 040 * </ul> 041 * 042 * 043 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3> 044 * 045 * <pre> 046 * BOMInputStream bomIn = new BOMInputStream(in); 047 * if (bomIn.hasBOM()) { 048 * // has a UTF-8 BOM 049 * } 050 * </pre> 051 * 052 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3> 053 * 054 * <pre> 055 * boolean include = true; 056 * BOMInputStream bomIn = new BOMInputStream(in, include); 057 * if (bomIn.hasBOM()) { 058 * // has a UTF-8 BOM 059 * } 060 * </pre> 061 * 062 * <h3>Example 3 - Detect Multiple BOMs</h3> 063 * 064 * <pre> 065 * BOMInputStream bomIn = new BOMInputStream(in, 066 * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, 067 * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE 068 * ); 069 * if (bomIn.hasBOM() == false) { 070 * // No BOM found 071 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 072 * // has a UTF-16LE BOM 073 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 074 * // has a UTF-16BE BOM 075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 076 * // has a UTF-32LE BOM 077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 078 * // has a UTF-32BE BOM 079 * } 080 * </pre> 081 * 082 * @see org.apache.commons.io.ByteOrderMark 083 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 084 * @version $Id: BOMInputStream.java 1346400 2012-06-05 14:48:01Z ggregory $ 085 * @since 2.0 086 */ 087 public class BOMInputStream extends ProxyInputStream { 088 private final boolean include; 089 /** 090 * BOMs are sorted from longest to shortest. 091 */ 092 private final List<ByteOrderMark> boms; 093 private ByteOrderMark byteOrderMark; 094 private int[] firstBytes; 095 private int fbLength; 096 private int fbIndex; 097 private int markFbIndex; 098 private boolean markedAtStart; 099 100 /** 101 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 102 * 103 * @param delegate 104 * the InputStream to delegate to 105 */ 106 public BOMInputStream(InputStream delegate) { 107 this(delegate, false, ByteOrderMark.UTF_8); 108 } 109 110 /** 111 * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it. 112 * 113 * @param delegate 114 * the InputStream to delegate to 115 * @param include 116 * true to include the UTF-8 BOM or false to exclude it 117 */ 118 public BOMInputStream(InputStream delegate, boolean include) { 119 this(delegate, include, ByteOrderMark.UTF_8); 120 } 121 122 /** 123 * Constructs a new BOM InputStream that excludes the specified BOMs. 124 * 125 * @param delegate 126 * the InputStream to delegate to 127 * @param boms 128 * The BOMs to detect and exclude 129 */ 130 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) { 131 this(delegate, false, boms); 132 } 133 134 /** 135 * Compares ByteOrderMark objects in descending length order. 136 */ 137 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() { 138 139 public int compare(ByteOrderMark bom1, ByteOrderMark bom2) { 140 int len1 = bom1.length(); 141 int len2 = bom2.length(); 142 if (len1 > len2) { 143 return -1; 144 } 145 if (len2 > len1) { 146 return 1; 147 } 148 return 0; 149 } 150 }; 151 152 /** 153 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 154 * 155 * @param delegate 156 * the InputStream to delegate to 157 * @param include 158 * true to include the specified BOMs or false to exclude them 159 * @param boms 160 * The BOMs to detect and optionally exclude 161 */ 162 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) { 163 super(delegate); 164 if (boms == null || boms.length == 0) { 165 throw new IllegalArgumentException("No BOMs specified"); 166 } 167 this.include = include; 168 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 169 Arrays.sort(boms, ByteOrderMarkLengthComparator); 170 this.boms = Arrays.asList(boms); 171 172 } 173 174 /** 175 * Indicates whether the stream contains one of the specified BOMs. 176 * 177 * @return true if the stream has one of the specified BOMs, otherwise false if it does not 178 * @throws IOException 179 * if an error reading the first bytes of the stream occurs 180 */ 181 public boolean hasBOM() throws IOException { 182 return getBOM() != null; 183 } 184 185 /** 186 * Indicates whether the stream contains the specified BOM. 187 * 188 * @param bom 189 * The BOM to check for 190 * @return true if the stream has the specified BOM, otherwise false if it does not 191 * @throws IllegalArgumentException 192 * if the BOM is not one the stream is configured to detect 193 * @throws IOException 194 * if an error reading the first bytes of the stream occurs 195 */ 196 public boolean hasBOM(ByteOrderMark bom) throws IOException { 197 if (!boms.contains(bom)) { 198 throw new IllegalArgumentException("Stream not configure to detect " + bom); 199 } 200 return byteOrderMark != null && getBOM().equals(bom); 201 } 202 203 /** 204 * Return the BOM (Byte Order Mark). 205 * 206 * @return The BOM or null if none 207 * @throws IOException 208 * if an error reading the first bytes of the stream occurs 209 */ 210 public ByteOrderMark getBOM() throws IOException { 211 if (firstBytes == null) { 212 fbLength = 0; 213 // BOMs are sorted from longest to shortest 214 final int maxBomSize = boms.get(0).length(); 215 firstBytes = new int[maxBomSize]; 216 // Read first maxBomSize bytes 217 for (int i = 0; i < firstBytes.length; i++) { 218 firstBytes[i] = in.read(); 219 fbLength++; 220 if (firstBytes[i] < 0) { 221 break; 222 } 223 } 224 // match BOM in firstBytes 225 byteOrderMark = find(); 226 if (byteOrderMark != null) { 227 if (!include) { 228 if (byteOrderMark.length() < firstBytes.length) { 229 fbIndex = byteOrderMark.length(); 230 } else { 231 fbLength = 0; 232 } 233 } 234 } 235 } 236 return byteOrderMark; 237 } 238 239 /** 240 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 241 * 242 * @return The BOM charset Name or null if no BOM found 243 * @throws IOException 244 * if an error reading the first bytes of the stream occurs 245 * 246 */ 247 public String getBOMCharsetName() throws IOException { 248 getBOM(); 249 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 250 } 251 252 /** 253 * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte 254 * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been 255 * processed already. 256 * 257 * @return the byte read (excluding BOM) or -1 if the end of stream 258 * @throws IOException 259 * if an I/O error occurs 260 */ 261 private int readFirstBytes() throws IOException { 262 getBOM(); 263 return fbIndex < fbLength ? firstBytes[fbIndex++] : -1; 264 } 265 266 /** 267 * Find a BOM with the specified bytes. 268 * 269 * @return The matched BOM or null if none matched 270 */ 271 private ByteOrderMark find() { 272 for (ByteOrderMark bom : boms) { 273 if (matches(bom)) { 274 return bom; 275 } 276 } 277 return null; 278 } 279 280 /** 281 * Check if the bytes match a BOM. 282 * 283 * @param bom 284 * The BOM 285 * @return true if the bytes match the bom, otherwise false 286 */ 287 private boolean matches(ByteOrderMark bom) { 288 // if (bom.length() != fbLength) { 289 // return false; 290 // } 291 // firstBytes may be bigger than the BOM bytes 292 for (int i = 0; i < bom.length(); i++) { 293 if (bom.get(i) != firstBytes[i]) { 294 return false; 295 } 296 } 297 return true; 298 } 299 300 // ---------------------------------------------------------------------------- 301 // Implementation of InputStream 302 // ---------------------------------------------------------------------------- 303 304 /** 305 * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM. 306 * 307 * @return the byte read (excluding BOM) or -1 if the end of stream 308 * @throws IOException 309 * if an I/O error occurs 310 */ 311 @Override 312 public int read() throws IOException { 313 int b = readFirstBytes(); 314 return b >= 0 ? b : in.read(); 315 } 316 317 /** 318 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM. 319 * 320 * @param buf 321 * the buffer to read the bytes into 322 * @param off 323 * The start offset 324 * @param len 325 * The number of bytes to read (excluding BOM) 326 * @return the number of bytes read or -1 if the end of stream 327 * @throws IOException 328 * if an I/O error occurs 329 */ 330 @Override 331 public int read(byte[] buf, int off, int len) throws IOException { 332 int firstCount = 0; 333 int b = 0; 334 while (len > 0 && b >= 0) { 335 b = readFirstBytes(); 336 if (b >= 0) { 337 buf[off++] = (byte) (b & 0xFF); 338 len--; 339 firstCount++; 340 } 341 } 342 int secondCount = in.read(buf, off, len); 343 return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount; 344 } 345 346 /** 347 * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM. 348 * 349 * @param buf 350 * the buffer to read the bytes into 351 * @return the number of bytes read (excluding BOM) or -1 if the end of stream 352 * @throws IOException 353 * if an I/O error occurs 354 */ 355 @Override 356 public int read(byte[] buf) throws IOException { 357 return read(buf, 0, buf.length); 358 } 359 360 /** 361 * Invokes the delegate's <code>mark(int)</code> method. 362 * 363 * @param readlimit 364 * read ahead limit 365 */ 366 @Override 367 public synchronized void mark(int readlimit) { 368 markFbIndex = fbIndex; 369 markedAtStart = firstBytes == null; 370 in.mark(readlimit); 371 } 372 373 /** 374 * Invokes the delegate's <code>reset()</code> method. 375 * 376 * @throws IOException 377 * if an I/O error occurs 378 */ 379 @Override 380 public synchronized void reset() throws IOException { 381 fbIndex = markFbIndex; 382 if (markedAtStart) { 383 firstBytes = null; 384 } 385 386 in.reset(); 387 } 388 389 /** 390 * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM. 391 * 392 * @param n 393 * the number of bytes to skip 394 * @return the number of bytes to skipped or -1 if the end of stream 395 * @throws IOException 396 * if an I/O error occurs 397 */ 398 @Override 399 public long skip(long n) throws IOException { 400 while (n > 0 && readFirstBytes() >= 0) { 401 n--; 402 } 403 return in.skip(n); 404 } 405 }