/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 * <p>
 * To build an instance, see {@link Builder}.
 * </p>
 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setInclude(include)
 *     .get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detecting Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder()
 *   .setInputStream(in)
 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
 *   .get();
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    /**
     * Builds a new {@link BOMInputStream} instance.
     *
     * <h2>Using NIO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(Paths.get("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     * <h2>Using IO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setFile(new File("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @since 2.12.0
     */
    public static class Builder extends AbstractStreamBuilder<BOMInputStream, Builder> {

        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        /**
         * For test access.
         *
         * @return the default byte order mark
         */
        static ByteOrderMark getDefaultByteOrderMark() {
            return DEFAULT[0];
        }

        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        private boolean include;

        /**
         * Constructs a new instance.
         * <p>
         * This builder uses the aspects InputStream, OpenOption[], include, and ByteOrderMark[].
         * </p>
         * <p>
         * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an
         * {@link UnsupportedOperationException}.
         * </p>
         *
         * @return a new instance.
         * @throws UnsupportedOperationException if the origin cannot provide an InputStream.
         * @see #getInputStream()
         */
        @SuppressWarnings("resource")
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(getInputStream(), include, byteOrderMarks);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The default is {@link ByteOrderMark#UTF_8}. Passing {@code null} restores the default. A non-null array is
         * copied, so later changes to the caller's array do not affect this builder.
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return this
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
         * <p>
         * The default is false.
         * </p>
         *
         * @param include true to include the UTF-8 BOM or false to exclude it.
         * @return this
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The BOM matched by {@link #getBOM()}, or null if none matched (or detection has not run yet). */
    private ByteOrderMark byteOrderMark;

    /** Index of the next buffered first byte to hand out. */
    private int fbIndex;

    /** Number of slots of {@link #firstBytes} that may still be handed out. */
    private int fbLength;

    /** The first bytes read from the delegate for BOM detection; null until detection has run. */
    private int[] firstBytes;

    /** Whether a detected BOM is returned to the reader (true) or skipped (false). */
    private final boolean include;

    /** Whether {@link #mark(int)} was called before BOM detection had run. */
    private boolean markedAtStart;

    /** Value of {@link #fbIndex} captured by {@link #mark(int)}. */
    private int markFbIndex;

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     * <p>
     * The given array is copied before it is sorted, so the caller's array is left untouched.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if no BOMs are specified
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Arrays.asList is a write-through view of its array, so sort a private copy: this keeps the caller's
        // array (and the shared Builder.DEFAULT array) in its original order and un-aliased from this stream.
        final List<ByteOrderMark> list = Arrays.asList(boms.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(ByteOrderMarkLengthComparator);
        this.boms = list;
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @throws IllegalArgumentException
     *             if no BOMs are specified
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Finds the first configured BOM (longest first) that matches the buffered first bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        return boms.stream().filter(this::matches).findFirst().orElse(null);
    }

    /**
     * Gets the BOM (Byte Order Mark).
     * <p>
     * The first call reads up to as many bytes from the delegate as the longest configured BOM and buffers them;
     * later calls return the cached result without reading.
     * </p>
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                // The end-of-stream marker is stored and counted too; readFirstBytes() then hands it out as EOF.
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // Shorter BOM than the buffer: skip only the BOM, keep the bytes read after it.
                    fbIndex = byteOrderMark.length();
                } else {
                    // The BOM fills the whole buffer: nothing buffered remains to hand out.
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, after recording the current position in the buffered first
     * bytes so that {@link #reset()} can restore it.
     *
     * @param readLimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readLimit) {
        markFbIndex = fbIndex;
        // If detection has not run yet, reset() must discard the buffer so the first bytes are read again.
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }

    /**
     * Checks if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // firstBytes may be bigger than the BOM bytes: it is sized to the longest configured BOM.
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        int firstCount = 0;
        int b = 0;
        // Drain the buffered first bytes before delegating.
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        final int secondCount = in.read(buf, off, len);
        // If the delegate is at end of stream, still report any buffered bytes that were copied.
        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method, after restoring the position in the buffered first bytes that
     * was recorded by {@link #mark(int)}.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark preceded BOM detection: drop the buffer so the first bytes are read again.
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of buffered first bytes skipped plus the value returned by the delegate's {@code skip(long)}
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        int skipped = 0;
        // Consume buffered first bytes before asking the delegate to skip the remainder.
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}