/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 *
 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = new BOMInputStream(in, include);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detect Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in,
 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
 *   );
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    /**
     * Builds a new {@link BOMInputStream} instance.
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(path)
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @since 2.12.0
     */
    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {

        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        // for test access
        static ByteOrderMark getDefaultBOM() {
            return DEFAULT[0];
        }

        /** The BOMs to detect; defaults to UTF-8 only. */
        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        /** Whether a detected BOM is passed through to the reader; defaults to {@code false} (excluded). */
        private boolean include;

        /**
         * Constructs a new instance from the configured origin, include flag, and BOMs.
         *
         * @return a new {@link BOMInputStream}.
         * @throws IllegalArgumentException if no BOMs are configured.
         * @throws UnsupportedOperationException if the origin cannot be converted to an InputStream.
         * @throws IOException if the origin's InputStream cannot be obtained.
         */
        @SuppressWarnings("resource")
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(getOrigin().getInputStream(), include, byteOrderMarks);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The given array is copied; passing {@code null} restores the default (UTF-8 only).
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return this
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include the detected BOM (true) or to exclude it (false).
         *
         * @param include true to include the BOM or false to exclude it.
         * @return this
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> BYTE_ORDER_MARK_LENGTH_COMPARATOR = Comparator.comparing(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The BOM matched by {@link #getBOM()}, or null if none matched (or nothing has been read yet). */
    private ByteOrderMark byteOrderMark;

    /** Index of the next entry of {@link #firstBytes} to hand out. */
    private int fbIndex;

    /** Number of entries of {@link #firstBytes} to hand out; this counts a trailing end-of-stream marker if one was read. */
    private int fbLength;

    /** The first bytes of the stream, read lazily by {@link #getBOM()}; null until then. */
    private int[] firstBytes;

    /** True to pass a detected BOM through to the reader, false to drop it. */
    private final boolean include;

    /** True if {@link #mark(int)} was called before the first bytes were read. */
    private boolean markedAtStart;

    /** The value of {@link #fbIndex} when {@link #mark(int)} was called. */
    private int markFbIndex;

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     * <p>
     * The given array is copied; it is not modified or retained.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if no BOMs are specified
     * @deprecated Use {@link #builder()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Arrays.asList is a write-through view of its array, so sort a private copy
        // rather than reordering the array the caller handed in.
        final List<ByteOrderMark> list = Arrays.asList(boms.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(BYTE_ORDER_MARK_LENGTH_COMPARATOR);
        this.boms = list;
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Finds the first configured BOM (longest first) that matches the bytes already read into {@code firstBytes}.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        return boms.stream().filter(this::matches).findFirst().orElse(null);
    }

    /**
     * Gets the BOM (Byte Order Mark).
     * <p>
     * The first call reads up to as many bytes from the delegate as the longest configured BOM; later calls
     * return the cached result.
     * </p>
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                // The end-of-stream marker is counted too, so it is replayed once by readFirstBytes().
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // Shorter BOM than the buffer: skip the BOM, keep the trailing bytes for replay.
                    fbIndex = byteOrderMark.length();
                } else {
                    // The BOM fills the whole buffer: nothing left to replay.
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, remembering the current position within the buffered
     * first bytes so that {@link #reset()} can restore it.
     *
     * @param readlimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readlimit) {
        markFbIndex = fbIndex;
        markedAtStart = firstBytes == null;
        in.mark(readlimit);
    }

    /**
     * Checks if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // firstBytes is sized to the longest configured BOM, so it may be bigger than this BOM's bytes.
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // First drain whatever is still buffered from BOM detection.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        // Then fill the remainder straight from the delegate.
        final int secondCount = in.read(buf, off, len);
        if (secondCount >= 0) {
            return firstCount + secondCount;
        }
        // Delegate is at end of stream: report the buffered bytes if any were delivered.
        return firstCount > 0 ? firstCount : EOF;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method, restoring the position within the buffered first bytes that
     * was recorded by {@link #mark(int)}.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark predates BOM detection, so force the first bytes to be read again.
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of bytes skipped: the buffered first bytes consumed plus the count reported by the delegate
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        int skipped = 0;
        // Consume buffered first bytes before asking the delegate to skip the rest.
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}