/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setInclude(include)
 *     .get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detecting Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
 *     .get();
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see Builder
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    // @formatter:off
    /**
     * Builds a new {@link BOMInputStream}.
     *
     * <h2>Using NIO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(Paths.get("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     * <h2>Using IO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setFile(new File("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.12.0
     */
    // @formatter:on
    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {

        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        /**
         * For test access.
         *
         * @return the default byte order mark
         */
        static ByteOrderMark getDefaultByteOrderMark() {
            return DEFAULT[0];
        }

        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        private boolean include;

        /**
         * Builds a new {@link BOMInputStream}.
         * <p>
         * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
         * </p>
         * <ul>
         * <li>{@link #getInputStream()}</li>
         * <li>include</li>
         * <li>byteOrderMarks</li>
         * </ul>
         *
         * @return a new instance.
         * @throws IllegalStateException         if the {@code origin} is {@code null}.
         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
         * @throws IOException                   if an I/O error occurs.
         * @see #getInputStream()
         */
        @SuppressWarnings("resource")
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(getInputStream(), include, byteOrderMarks);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The default is {@link ByteOrderMark#UTF_8}. Passing {@code null} restores the default. The given array is
         * copied, so later changes to it do not affect this builder.
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return {@code this} instance.
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include the detected BOM (true) or to exclude it (false).
         * <p>
         * The default is false.
         * </p>
         *
         * @param include true to include the detected BOM or false to exclude it.
         * @return {@code this} instance.
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The BOM matched by {@link #getBOM()}, or null if none matched (or detection has not run yet). */
    private ByteOrderMark byteOrderMark;

    /** Index of the next buffered first byte to hand out from {@link #firstBytes}. */
    private int fbIndex;

    /** Number of slots of {@link #firstBytes} that are available to hand out. */
    private int fbLength;

    /** The first bytes read from the delegate for BOM detection; null until detection has run. */
    private int[] firstBytes;

    /** Whether a detected BOM is passed through to the reader (true) or skipped (false). */
    private final boolean include;

    /** Whether {@link #mark(int)} was called before the first bytes were read. */
    private boolean markedAtStart;

    /** Value of {@link #fbIndex} captured by {@link #mark(int)}. */
    private int markFbIndex;

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     * <p>
     * The given array is copied; it is neither reordered nor retained by this stream.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if {@code boms} is null or empty
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Arrays.asList is a write-through view of its array, so sort a private copy: this keeps the caller's
        // array in its original order and isolates this stream from later changes to it.
        final List<ByteOrderMark> list = Arrays.asList(boms.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(ByteOrderMarkLengthComparator);
        this.boms = list;
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Finds the first configured BOM (longest first) that matches the buffered first bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        return boms.stream().filter(this::matches).findFirst().orElse(null);
    }

    /**
     * Gets the BOM (Byte Order Mark).
     * <p>
     * The first call reads up to as many bytes from the delegate as the longest configured BOM has; later calls
     * return the cached result.
     * </p>
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                // The slot is counted even when it holds the end-of-stream value; readFirstBytes() then hands
                // that negative value back, which callers treat as "no more first bytes".
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // A shorter BOM matched: skip it but keep the bytes read beyond it.
                    fbIndex = byteOrderMark.length();
                } else {
                    // The BOM fills the whole buffer: nothing buffered is left to hand out.
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, remembering the current position in the buffered first bytes.
     *
     * @param readLimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readLimit) {
        markFbIndex = fbIndex;
        // If nothing has been read yet, reset() must discard the detection buffer so the BOM is read again.
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }

    /**
     * Checks if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // firstBytes is sized to the longest configured BOM, so it may be bigger than this BOM's bytes but
        // indexing up to bom.length() is always in range.
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        checkOpen();
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // First drain whatever is left of the bytes buffered during BOM detection.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        // Then let the delegate fill the remainder of the requested range.
        final int secondCount = in.read(buf, off, len);
        if (secondCount < 0) {
            // Delegate is exhausted: report the buffered bytes if there were any, otherwise end of stream.
            return firstCount > 0 ? firstCount : EOF;
        }
        return firstCount + secondCount;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method, restoring the position in the buffered first bytes that was
     * captured by {@link #mark(int)}.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark was set before detection ran, so drop the buffer and detect again on the next read.
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of bytes skipped or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        // Consume buffered first bytes before asking the delegate to skip the rest.
        int skipped = 0;
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}