| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| BOMInputStream |
|
| 2.8333333333333335;2.833 | ||||
| BOMInputStream$1 |
|
| 2.8333333333333335;2.833 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | package org.apache.commons.io.input; | |
| 18 | ||
| 19 | import java.io.IOException; | |
| 20 | import java.io.InputStream; | |
| 21 | import java.util.Arrays; | |
| 22 | import java.util.Comparator; | |
| 23 | import java.util.List; | |
| 24 | ||
| 25 | import org.apache.commons.io.ByteOrderMark; | |
| 26 | ||
| 27 | /** | |
| 28 | * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. | |
| 29 | * | |
| 30 | * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the | |
| 31 | * first byte in the stream. | |
| 32 | * | |
| 33 | * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: | |
| 34 | * <ul> | |
| 35 | * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> | |
| 36 | * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> | |
| 37 | * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> | |
| 38 | * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> | |
| 39 | * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> | |
| 40 | * </ul> | |
| 41 | * | |
| 42 | * | |
| 43 | * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3> | |
| 44 | * | |
| 45 | * <pre> | |
| 46 | * BOMInputStream bomIn = new BOMInputStream(in); | |
| 47 | * if (bomIn.hasBOM()) { | |
| 48 | * // has a UTF-8 BOM | |
| 49 | * } | |
| 50 | * </pre> | |
| 51 | * | |
| 52 | * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3> | |
| 53 | * | |
| 54 | * <pre> | |
| 55 | * boolean include = true; | |
| 56 | * BOMInputStream bomIn = new BOMInputStream(in, include); | |
| 57 | * if (bomIn.hasBOM()) { | |
| 58 | * // has a UTF-8 BOM | |
| 59 | * } | |
| 60 | * </pre> | |
| 61 | * | |
| 62 | * <h3>Example 3 - Detect Multiple BOMs</h3> | |
| 63 | * | |
| 64 | * <pre> | |
| 65 | * BOMInputStream bomIn = new BOMInputStream(in, | |
| 66 | * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, | |
| 67 | * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE | |
| 68 | * ); | |
| 69 | * if (bomIn.hasBOM() == false) { | |
| 70 | * // No BOM found | |
| 71 | * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { | |
| 72 | * // has a UTF-16LE BOM | |
| 73 | * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { | |
| 74 | * // has a UTF-16BE BOM | |
| 75 | * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { | |
| 76 | * // has a UTF-32LE BOM | |
| 77 | * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { | |
| 78 | * // has a UTF-32BE BOM | |
| 79 | * } | |
| 80 | * </pre> | |
| 81 | * | |
| 82 | * @see org.apache.commons.io.ByteOrderMark | |
| 83 | * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> | |
| 84 | * @version $Id: BOMInputStream.java 1415850 2012-11-30 20:51:39Z ggregory $ | |
| 85 | * @since 2.0 | |
| 86 | */ | |
| 87 | public class BOMInputStream extends ProxyInputStream { | |
| 88 | private final boolean include; | |
| 89 | /** | |
| 90 | * BOMs are sorted from longest to shortest. | |
| 91 | */ | |
| 92 | private final List<ByteOrderMark> boms; | |
| 93 | private ByteOrderMark byteOrderMark; | |
| 94 | private int[] firstBytes; | |
| 95 | private int fbLength; | |
| 96 | private int fbIndex; | |
| 97 | private int markFbIndex; | |
| 98 | private boolean markedAtStart; | |
| 99 | ||
| 100 | /** | |
| 101 | * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. | |
| 102 | * | |
| 103 | * @param delegate | |
| 104 | * the InputStream to delegate to | |
| 105 | */ | |
| 106 | public BOMInputStream(final InputStream delegate) { | |
| 107 | 26 | this(delegate, false, ByteOrderMark.UTF_8); |
| 108 | 26 | } |
| 109 | ||
| 110 | /** | |
| 111 | * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it. | |
| 112 | * | |
| 113 | * @param delegate | |
| 114 | * the InputStream to delegate to | |
| 115 | * @param include | |
| 116 | * true to include the UTF-8 BOM or false to exclude it | |
| 117 | */ | |
| 118 | public BOMInputStream(final InputStream delegate, final boolean include) { | |
| 119 | 2 | this(delegate, include, ByteOrderMark.UTF_8); |
| 120 | 2 | } |
| 121 | ||
| 122 | /** | |
| 123 | * Constructs a new BOM InputStream that excludes the specified BOMs. | |
| 124 | * | |
| 125 | * @param delegate | |
| 126 | * the InputStream to delegate to | |
| 127 | * @param boms | |
| 128 | * The BOMs to detect and exclude | |
| 129 | */ | |
| 130 | public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { | |
| 131 | 11 | this(delegate, false, boms); |
| 132 | 11 | } |
| 133 | ||
| 134 | /** | |
| 135 | * Compares ByteOrderMark objects in descending length order. | |
| 136 | */ | |
| 137 | 2290 | private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() { |
| 138 | ||
| 139 | public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) { | |
| 140 | 2287 | final int len1 = bom1.length(); |
| 141 | 2287 | final int len2 = bom2.length(); |
| 142 | 2287 | if (len1 > len2) { |
| 143 | 31 | return -1; |
| 144 | } | |
| 145 | 2256 | if (len2 > len1) { |
| 146 | 747 | return 1; |
| 147 | } | |
| 148 | 1509 | return 0; |
| 149 | } | |
| 150 | }; | |
| 151 | ||
| 152 | /** | |
| 153 | * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. | |
| 154 | * | |
| 155 | * @param delegate | |
| 156 | * the InputStream to delegate to | |
| 157 | * @param include | |
| 158 | * true to include the specified BOMs or false to exclude them | |
| 159 | * @param boms | |
| 160 | * The BOMs to detect and optionally exclude | |
| 161 | */ | |
| 162 | public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) { | |
| 163 | 543 | super(delegate); |
| 164 | 543 | if (boms == null || boms.length == 0) { |
| 165 | 2 | throw new IllegalArgumentException("No BOMs specified"); |
| 166 | } | |
| 167 | 541 | this.include = include; |
| 168 | // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. | |
| 169 | 541 | Arrays.sort(boms, ByteOrderMarkLengthComparator); |
| 170 | 541 | this.boms = Arrays.asList(boms); |
| 171 | ||
| 172 | 541 | } |
| 173 | ||
| 174 | /** | |
| 175 | * Indicates whether the stream contains one of the specified BOMs. | |
| 176 | * | |
| 177 | * @return true if the stream has one of the specified BOMs, otherwise false if it does not | |
| 178 | * @throws IOException | |
| 179 | * if an error reading the first bytes of the stream occurs | |
| 180 | */ | |
| 181 | public boolean hasBOM() throws IOException { | |
| 182 | 12 | return getBOM() != null; |
| 183 | } | |
| 184 | ||
| 185 | /** | |
| 186 | * Indicates whether the stream contains the specified BOM. | |
| 187 | * | |
| 188 | * @param bom | |
| 189 | * The BOM to check for | |
| 190 | * @return true if the stream has the specified BOM, otherwise false if it does not | |
| 191 | * @throws IllegalArgumentException | |
| 192 | * if the BOM is not one the stream is configured to detect | |
| 193 | * @throws IOException | |
| 194 | * if an error reading the first bytes of the stream occurs | |
| 195 | */ | |
| 196 | public boolean hasBOM(final ByteOrderMark bom) throws IOException { | |
| 197 | 18 | if (!boms.contains(bom)) { |
| 198 | 5 | throw new IllegalArgumentException("Stream not configure to detect " + bom); |
| 199 | } | |
| 200 | 13 | return byteOrderMark != null && getBOM().equals(bom); |
| 201 | } | |
| 202 | ||
| 203 | /** | |
| 204 | * Return the BOM (Byte Order Mark). | |
| 205 | * | |
| 206 | * @return The BOM or null if none | |
| 207 | * @throws IOException | |
| 208 | * if an error reading the first bytes of the stream occurs | |
| 209 | */ | |
| 210 | public ByteOrderMark getBOM() throws IOException { | |
| 211 | 6223 | if (firstBytes == null) { |
| 212 | 542 | fbLength = 0; |
| 213 | // BOMs are sorted from longest to shortest | |
| 214 | 542 | final int maxBomSize = boms.get(0).length(); |
| 215 | 542 | firstBytes = new int[maxBomSize]; |
| 216 | // Read first maxBomSize bytes | |
| 217 | 3454 | for (int i = 0; i < firstBytes.length; i++) { |
| 218 | 3137 | firstBytes[i] = in.read(); |
| 219 | 3137 | fbLength++; |
| 220 | 3137 | if (firstBytes[i] < 0) { |
| 221 | 225 | break; |
| 222 | } | |
| 223 | } | |
| 224 | // match BOM in firstBytes | |
| 225 | 542 | byteOrderMark = find(); |
| 226 | 542 | if (byteOrderMark != null) { |
| 227 | 213 | if (!include) { |
| 228 | 95 | if (byteOrderMark.length() < firstBytes.length) { |
| 229 | 48 | fbIndex = byteOrderMark.length(); |
| 230 | } else { | |
| 231 | 47 | fbLength = 0; |
| 232 | } | |
| 233 | } | |
| 234 | } | |
| 235 | } | |
| 236 | 6223 | return byteOrderMark; |
| 237 | } | |
| 238 | ||
| 239 | /** | |
| 240 | * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. | |
| 241 | * | |
| 242 | * @return The BOM charset Name or null if no BOM found | |
| 243 | * @throws IOException | |
| 244 | * if an error reading the first bytes of the stream occurs | |
| 245 | * | |
| 246 | */ | |
| 247 | public String getBOMCharsetName() throws IOException { | |
| 248 | 502 | getBOM(); |
| 249 | 502 | return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); |
| 250 | } | |
| 251 | ||
| 252 | /** | |
| 253 | * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte | |
| 254 | * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been | |
| 255 | * processed already. | |
| 256 | * | |
| 257 | * @return the byte read (excluding BOM) or -1 if the end of stream | |
| 258 | * @throws IOException | |
| 259 | * if an I/O error occurs | |
| 260 | */ | |
| 261 | private int readFirstBytes() throws IOException { | |
| 262 | 5687 | getBOM(); |
| 263 | 5687 | return fbIndex < fbLength ? firstBytes[fbIndex++] : -1; |
| 264 | } | |
| 265 | ||
| 266 | /** | |
| 267 | * Find a BOM with the specified bytes. | |
| 268 | * | |
| 269 | * @return The matched BOM or null if none matched | |
| 270 | */ | |
| 271 | private ByteOrderMark find() { | |
| 272 | 542 | for (final ByteOrderMark bom : boms) { |
| 273 | 2288 | if (matches(bom)) { |
| 274 | 213 | return bom; |
| 275 | } | |
| 276 | 2075 | } |
| 277 | 329 | return null; |
| 278 | } | |
| 279 | ||
| 280 | /** | |
| 281 | * Check if the bytes match a BOM. | |
| 282 | * | |
| 283 | * @param bom | |
| 284 | * The BOM | |
| 285 | * @return true if the bytes match the bom, otherwise false | |
| 286 | */ | |
| 287 | private boolean matches(final ByteOrderMark bom) { | |
| 288 | // if (bom.length() != fbLength) { | |
| 289 | // return false; | |
| 290 | // } | |
| 291 | // firstBytes may be bigger than the BOM bytes | |
| 292 | 3721 | for (int i = 0; i < bom.length(); i++) { |
| 293 | 3508 | if (bom.get(i) != firstBytes[i]) { |
| 294 | 2075 | return false; |
| 295 | } | |
| 296 | } | |
| 297 | 213 | return true; |
| 298 | } | |
| 299 | ||
| 300 | // ---------------------------------------------------------------------------- | |
| 301 | // Implementation of InputStream | |
| 302 | // ---------------------------------------------------------------------------- | |
| 303 | ||
| 304 | /** | |
| 305 | * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM. | |
| 306 | * | |
| 307 | * @return the byte read (excluding BOM) or -1 if the end of stream | |
| 308 | * @throws IOException | |
| 309 | * if an I/O error occurs | |
| 310 | */ | |
| 311 | @Override | |
| 312 | public int read() throws IOException { | |
| 313 | 3175 | final int b = readFirstBytes(); |
| 314 | 3175 | return b >= 0 ? b : in.read(); |
| 315 | } | |
| 316 | ||
| 317 | /** | |
| 318 | * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM. | |
| 319 | * | |
| 320 | * @param buf | |
| 321 | * the buffer to read the bytes into | |
| 322 | * @param off | |
| 323 | * The start offset | |
| 324 | * @param len | |
| 325 | * The number of bytes to read (excluding BOM) | |
| 326 | * @return the number of bytes read or -1 if the end of stream | |
| 327 | * @throws IOException | |
| 328 | * if an I/O error occurs | |
| 329 | */ | |
| 330 | @Override | |
| 331 | public int read(final byte[] buf, int off, int len) throws IOException { | |
| 332 | 797 | int firstCount = 0; |
| 333 | 797 | int b = 0; |
| 334 | 3306 | while (len > 0 && b >= 0) { |
| 335 | 2509 | b = readFirstBytes(); |
| 336 | 2509 | if (b >= 0) { |
| 337 | 1935 | buf[off++] = (byte) (b & 0xFF); |
| 338 | 1935 | len--; |
| 339 | 1935 | firstCount++; |
| 340 | } | |
| 341 | } | |
| 342 | 797 | final int secondCount = in.read(buf, off, len); |
| 343 | 797 | return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount; |
| 344 | } | |
| 345 | ||
| 346 | /** | |
| 347 | * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM. | |
| 348 | * | |
| 349 | * @param buf | |
| 350 | * the buffer to read the bytes into | |
| 351 | * @return the number of bytes read (excluding BOM) or -1 if the end of stream | |
| 352 | * @throws IOException | |
| 353 | * if an I/O error occurs | |
| 354 | */ | |
| 355 | @Override | |
| 356 | public int read(final byte[] buf) throws IOException { | |
| 357 | 289 | return read(buf, 0, buf.length); |
| 358 | } | |
| 359 | ||
| 360 | /** | |
| 361 | * Invokes the delegate's <code>mark(int)</code> method. | |
| 362 | * | |
| 363 | * @param readlimit | |
| 364 | * read ahead limit | |
| 365 | */ | |
| 366 | @Override | |
| 367 | public synchronized void mark(final int readlimit) { | |
| 368 | 238 | markFbIndex = fbIndex; |
| 369 | 238 | markedAtStart = firstBytes == null; |
| 370 | 238 | in.mark(readlimit); |
| 371 | 238 | } |
| 372 | ||
| 373 | /** | |
| 374 | * Invokes the delegate's <code>reset()</code> method. | |
| 375 | * | |
| 376 | * @throws IOException | |
| 377 | * if an I/O error occurs | |
| 378 | */ | |
| 379 | @Override | |
| 380 | public synchronized void reset() throws IOException { | |
| 381 | 238 | fbIndex = markFbIndex; |
| 382 | 238 | if (markedAtStart) { |
| 383 | 4 | firstBytes = null; |
| 384 | } | |
| 385 | ||
| 386 | 238 | in.reset(); |
| 387 | 238 | } |
| 388 | ||
| 389 | /** | |
| 390 | * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM. | |
| 391 | * | |
| 392 | * @param n | |
| 393 | * the number of bytes to skip | |
| 394 | * @return the number of bytes to skipped or -1 if the end of stream | |
| 395 | * @throws IOException | |
| 396 | * if an I/O error occurs | |
| 397 | */ | |
| 398 | @Override | |
| 399 | public long skip(long n) throws IOException { | |
| 400 | 4 | while (n > 0 && readFirstBytes() >= 0) { |
| 401 | 2 | n--; |
| 402 | } | |
| 403 | 2 | return in.skip(n); |
| 404 | } | |
| 405 | } |