1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.io.input; 18 19 import static org.apache.commons.io.IOUtils.EOF; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.util.Arrays; 24 import java.util.Comparator; 25 import java.util.List; 26 import java.util.Objects; 27 28 import org.apache.commons.io.ByteOrderMark; 29 import org.apache.commons.io.IOUtils; 30 31 /** 32 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 33 * <p> 34 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the 35 * first byte in the stream. 36 * </p> 37 * <p> 38 * The {@link ByteOrderMark} implementation has the following predefined BOMs: 39 * </p> 40 * <ul> 41 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 42 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 43 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 44 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 45 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 46 * </ul> 47 * <p> 48 * To build an instance, use {@link Builder}. 49 * </p> 50 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2> 51 * 52 * <pre> 53 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get(); 54 * if (bomIn.hasBOM()) { 55 * // has a UTF-8 BOM 56 * } 57 * </pre> 58 * 59 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2> 60 * 61 * <pre> 62 * boolean include = true; 63 * BOMInputStream bomIn = BOMInputStream.builder() 64 * .setInputStream(in) 65 * .setInclude(include) 66 * .get(); 67 * if (bomIn.hasBOM()) { 68 * // has a UTF-8 BOM 69 * } 70 * </pre> 71 * 72 * <h2>Example 3 - Detecting Multiple BOMs</h2> 73 * 74 * <pre> 75 * BOMInputStream bomIn = BOMInputStream.builder() 76 * .setInputStream(in) 77 * .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE) 78 * .get(); 79 * if (bomIn.hasBOM() == false) { 80 * // No BOM found 81 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 82 * // has a UTF-16LE BOM 83 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 84 * // has a UTF-16BE BOM 85 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 86 * // has a UTF-32LE BOM 87 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 88 * // has a UTF-32BE BOM 89 * } 90 * </pre> 91 * <p> 92 * To build an instance, use {@link Builder}. 93 * </p> 94 * <p> 95 * This class is not thread-safe. 96 * </p> 97 * 98 * @see Builder 99 * @see org.apache.commons.io.ByteOrderMark 100 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 101 * @since 2.0 102 */ 103 public class BOMInputStream extends ProxyInputStream { 104 105 // @formatter:off 106 /** 107 * Builds a new {@link BOMInputStream}. 108 * 109 * <h2>Using NIO</h2> 110 * <pre>{@code 111 * BOMInputStream s = BOMInputStream.builder() 112 * .setPath(Paths.get("MyFile.xml")) 113 * .setByteOrderMarks(ByteOrderMark.UTF_8) 114 * .setInclude(false) 115 * .get();} 116 * </pre> 117 * <h2>Using IO</h2> 118 * <pre>{@code 119 * BOMInputStream s = BOMInputStream.builder() 120 * .setFile(new File("MyFile.xml")) 121 * .setByteOrderMarks(ByteOrderMark.UTF_8) 122 * .setInclude(false) 123 * .get();} 124 * </pre> 125 * 126 * @see #get() 127 * @since 2.12.0 128 */ 129 // @formatter:on 130 public static class Builder extends AbstractBuilder<BOMInputStream, Builder> { 131 132 private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 }; 133 134 /** 135 * For test access. 136 * 137 * @return the default byte order mark 138 */ 139 static ByteOrderMark getDefaultByteOrderMark() { 140 return DEFAULT[0]; 141 } 142 143 private ByteOrderMark[] byteOrderMarks = DEFAULT; 144 145 private boolean include; 146 147 /** 148 * Constructs a new builder of {@link BOMInputStream}. 149 */ 150 public Builder() { 151 // empty 152 } 153 154 /** 155 * Builds a new {@link BOMInputStream}. 156 * <p> 157 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception. 158 * </p> 159 * <p> 160 * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[]. 161 * </p> 162 * <p> 163 * This builder uses the following aspects: 164 * </p> 165 * <ul> 166 * <li>{@link #getInputStream()}</li> 167 * <li>include}</li> 168 * <li>byteOrderMarks</li> 169 * </ul> 170 * 171 * @return a new instance. 172 * @throws IllegalStateException if the {@code origin} is {@code null}. 173 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}. 174 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}. 175 * @see #getInputStream() 176 * @see #getUnchecked() 177 */ 178 @Override 179 public BOMInputStream get() throws IOException { 180 return new BOMInputStream(this); 181 } 182 183 /** 184 * Sets the ByteOrderMarks to detect and optionally exclude. 185 * <p> 186 * The default is {@link ByteOrderMark#UTF_8}. 187 * </p> 188 * 189 * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude. 190 * @return {@code this} instance. 191 */ 192 public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) { 193 this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT; 194 return this; 195 } 196 197 /** 198 * Sets whether to include the UTF-8 BOM (true) or to exclude it (false). 199 * <p> 200 * The default is false. 201 * </p> 202 * 203 * @param include true to include the UTF-8 BOM or false to exclude it. return this; 204 * @return {@code this} instance. 205 */ 206 public Builder setInclude(final boolean include) { 207 this.include = include; 208 return this; 209 } 210 211 } 212 213 /** 214 * Compares ByteOrderMark objects in descending length order. 215 */ 216 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed(); 217 218 /** 219 * Constructs a new {@link Builder}. 220 * 221 * @return a new {@link Builder}. 222 * @since 2.12.0 223 */ 224 public static Builder builder() { 225 return new Builder(); 226 } 227 228 /** 229 * BOMs are sorted from longest to shortest. 230 */ 231 private final List<ByteOrderMark> bomList; 232 233 private ByteOrderMark byteOrderMark; 234 private int fbIndex; 235 private int[] firstBytes; 236 private final boolean include; 237 private boolean markedAtStart; 238 private int markFbIndex; 239 240 private BOMInputStream(final Builder builder) throws IOException { 241 super(builder); 242 if (IOUtils.length(builder.byteOrderMarks) == 0) { 243 throw new IllegalArgumentException("No ByteOrderMark specified."); 244 } 245 this.include = builder.include; 246 final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks); 247 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 248 list.sort(ByteOrderMarkLengthComparator); 249 this.bomList = list; 250 } 251 252 /** 253 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 254 * 255 * @param delegate 256 * the InputStream to delegate to 257 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 258 */ 259 @Deprecated 260 public BOMInputStream(final InputStream delegate) { 261 this(delegate, false, Builder.DEFAULT); 262 } 263 264 /** 265 * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it. 266 * 267 * @param delegate 268 * the InputStream to delegate to 269 * @param include 270 * true to include the UTF-8 BOM or false to exclude it 271 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 272 */ 273 @Deprecated 274 public BOMInputStream(final InputStream delegate, final boolean include) { 275 this(delegate, include, Builder.DEFAULT); 276 } 277 278 /** 279 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 280 * 281 * @param delegate 282 * the InputStream to delegate to 283 * @param include 284 * true to include the specified BOMs or false to exclude them 285 * @param boms 286 * The BOMs to detect and optionally exclude 287 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 288 */ 289 @Deprecated 290 public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) { 291 super(delegate); 292 if (IOUtils.length(boms) == 0) { 293 throw new IllegalArgumentException("No BOMs specified"); 294 } 295 this.include = include; 296 final List<ByteOrderMark> list = Arrays.asList(boms); 297 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 298 list.sort(ByteOrderMarkLengthComparator); 299 this.bomList = list; 300 } 301 302 /** 303 * Constructs a new BOM InputStream that excludes the specified BOMs. 304 * 305 * @param delegate 306 * the InputStream to delegate to 307 * @param boms 308 * The BOMs to detect and exclude 309 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 310 */ 311 @Deprecated 312 public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { 313 this(delegate, false, boms); 314 } 315 316 /** 317 * Finds a ByteOrderMark with the configured bytes in {@code bomList}. 318 * 319 * @return The matched BOM or null if none matched. 320 */ 321 private ByteOrderMark find() { 322 return bomList.stream().filter(this::matches).findFirst().orElse(null); 323 } 324 325 /** 326 * Gets the ByteOrderMark (Byte Order Mark). 327 * 328 * @return The BOM or null if none matched. 329 * @throws IOException 330 * if an error reading the first bytes of the stream occurs. 331 */ 332 public ByteOrderMark getBOM() throws IOException { 333 if (firstBytes == null) { 334 byteOrderMark = readBom(); 335 } 336 return byteOrderMark; 337 } 338 339 /** 340 * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 341 * 342 * @return The BOM charset Name or null if no BOM found 343 * @throws IOException 344 * if an error reading the first bytes of the stream occurs 345 */ 346 public String getBOMCharsetName() throws IOException { 347 getBOM(); 348 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 349 } 350 351 /** 352 * Tests whether the stream contains one of the specified BOMs. 353 * 354 * @return true if the stream has one of the specified BOMs, otherwise false if it does not 355 * @throws IOException 356 * if an error reading the first bytes of the stream occurs 357 */ 358 public boolean hasBOM() throws IOException { 359 return getBOM() != null; 360 } 361 362 /** 363 * Tests whether the stream contains the specified BOM. 364 * 365 * @param bom 366 * The BOM to check for 367 * @return true if the stream has the specified BOM, otherwise false if it does not 368 * @throws IllegalArgumentException 369 * if the BOM is not one the stream is configured to detect 370 * @throws IOException 371 * if an error reading the first bytes of the stream occurs 372 */ 373 public boolean hasBOM(final ByteOrderMark bom) throws IOException { 374 if (!bomList.contains(bom)) { 375 throw new IllegalArgumentException("Stream not configured to detect " + bom); 376 } 377 return Objects.equals(getBOM(), bom); 378 } 379 380 /** 381 * Invokes the delegate's {@code mark(int)} method. 382 * 383 * @param readLimit 384 * read ahead limit 385 */ 386 @Override 387 public synchronized void mark(final int readLimit) { 388 markFbIndex = fbIndex; 389 markedAtStart = firstBytes == null; 390 in.mark(readLimit); 391 } 392 393 /** 394 * Checks if the bytes match a BOM. 395 * 396 * @param bom 397 * The BOM 398 * @return true if the bytes match the bom, otherwise false 399 */ 400 private boolean matches(final ByteOrderMark bom) { 401 return bom.matches(firstBytes); 402 } 403 404 /** 405 * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM. 406 * 407 * @return the byte read (excluding BOM) or -1 if the end of stream 408 * @throws IOException 409 * if an I/O error occurs 410 */ 411 @Override 412 public int read() throws IOException { 413 checkOpen(); 414 final int b = readFirstBytes(); 415 return b >= 0 ? b : in.read(); 416 } 417 418 /** 419 * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM. 420 * 421 * @param buf 422 * the buffer to read the bytes into 423 * @return the number of bytes read (excluding BOM) or -1 if the end of stream 424 * @throws IOException 425 * if an I/O error occurs 426 */ 427 @Override 428 public int read(final byte[] buf) throws IOException { 429 return read(buf, 0, buf.length); 430 } 431 432 /** 433 * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM. 434 * 435 * @param buf 436 * the buffer to read the bytes into 437 * @param off 438 * The start offset 439 * @param len 440 * The number of bytes to read (excluding BOM) 441 * @return the number of bytes read or -1 if the end of stream 442 * @throws IOException 443 * if an I/O error occurs 444 */ 445 @Override 446 public int read(final byte[] buf, int off, int len) throws IOException { 447 int firstCount = 0; 448 int b = 0; 449 while (len > 0 && b >= 0) { 450 b = readFirstBytes(); 451 if (b >= 0) { 452 buf[off++] = (byte) (b & 0xFF); 453 len--; 454 firstCount++; 455 } 456 } 457 final int secondCount = in.read(buf, off, len); 458 afterRead(secondCount); 459 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount; 460 } 461 462 private ByteOrderMark readBom() throws IOException { 463 int fbLength = 0; 464 // BOMs are sorted from longest to shortest 465 final int maxBomSize = bomList.get(0).length(); 466 final int[] tmp = new int[maxBomSize]; 467 // Read first maxBomSize bytes 468 for (int i = 0; i < tmp.length; i++) { 469 tmp[i] = in.read(); 470 afterRead(tmp[i]); 471 fbLength++; 472 if (tmp[i] < 0) { 473 break; 474 } 475 } 476 firstBytes = Arrays.copyOf(tmp, fbLength); 477 // match BOM in firstBytes 478 final ByteOrderMark bom = find(); 479 if (bom != null && !include) { 480 if (bom.length() < firstBytes.length) { 481 fbIndex = bom.length(); 482 } else { 483 firstBytes = new int[0]; 484 } 485 } 486 return bom; 487 } 488 489 /** 490 * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a 491 * valid byte or -1 to indicate that the initial bytes have been processed already. 492 * 493 * @return the byte read (excluding BOM) or -1 if at the end of first bytes. 494 * @throws IOException if an I/O error occurs 495 */ 496 private int readFirstBytes() throws IOException { 497 getBOM(); 498 return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF; 499 } 500 501 /** 502 * Invokes the delegate's {@code reset()} method. 503 * 504 * @throws IOException 505 * if an I/O error occurs 506 */ 507 @Override 508 public synchronized void reset() throws IOException { 509 fbIndex = markFbIndex; 510 if (markedAtStart) { 511 firstBytes = null; 512 } 513 in.reset(); 514 } 515 516 /** 517 * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM. 518 * 519 * @param n 520 * the number of bytes to skip 521 * @return the number of bytes to skipped or -1 if the end of stream 522 * @throws IOException 523 * if an I/O error occurs 524 */ 525 @Override 526 public long skip(final long n) throws IOException { 527 int skipped = 0; 528 while (n > skipped && readFirstBytes() >= 0) { 529 skipped++; 530 } 531 return in.skip(n - skipped) + skipped; 532 } 533 }