001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.io.input; 018 019 import java.io.IOException; 020 import java.io.InputStream; 021 import java.util.Arrays; 022 import java.util.List; 023 024 import org.apache.commons.io.ByteOrderMark; 025 026 /** 027 * This class is used to wrap a stream that includes an encoded 028 * {@link ByteOrderMark} as its first bytes. 029 * 030 * This class detects these bytes and, if required, can automatically skip them 031 * and return the subsequent byte as the first byte in the stream. 032 * 033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 034 * <ul> 035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 038 * </ul> 039 * 040 * 041 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3> 042 * <pre> 043 * BOMInputStream bomIn = new BOMInputStream(in); 044 * if (bomIn.hasBOM()) { 045 * // has a UTF-8 BOM 046 * } 047 * </pre> 048 * 049 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3> 050 * <pre> 051 * boolean include = true; 052 * BOMInputStream bomIn = new BOMInputStream(in, include); 053 * if (bomIn.hasBOM()) { 054 * // has a UTF-8 BOM 055 * } 056 * </pre> 057 * 058 * <h3>Example 3 - Detect Multiple BOMs</h3> 059 * <pre> 060 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE); 061 * if (bomIn.hasBOM() == false) { 062 * // No BOM found 063 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 064 * // has a UTF-16LE BOM 065 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 066 * // has a UTF-16BE BOM 067 * } 068 * </pre> 069 * 070 * @see org.apache.commons.io.ByteOrderMark 071 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 072 * @version $Revision: 1052095 $ $Date: 2010-12-22 18:03:20 -0500 (Wed, 22 Dec 2010) $ 073 * @since Commons IO 2.0 074 */ 075 public class BOMInputStream extends ProxyInputStream { 076 private final boolean include; 077 private final List<ByteOrderMark> boms; 078 private ByteOrderMark byteOrderMark; 079 private int[] firstBytes; 080 private int fbLength; 081 private int fbIndex; 082 private int markFbIndex; 083 private boolean markedAtStart; 084 085 /** 086 * Constructs a new BOM InputStream that excludes 087 * a {@link ByteOrderMark#UTF_8} BOM. 088 * @param delegate the InputStream to delegate to 089 */ 090 public BOMInputStream(InputStream delegate) { 091 this(delegate, false, ByteOrderMark.UTF_8); 092 } 093 094 /** 095 * Constructs a new BOM InputStream that detects a 096 * a {@link ByteOrderMark#UTF_8} and optionally includes it. 097 * @param delegate the InputStream to delegate to 098 * @param include true to include the UTF-8 BOM or 099 * false to exclude it 100 */ 101 public BOMInputStream(InputStream delegate, boolean include) { 102 this(delegate, include, ByteOrderMark.UTF_8); 103 } 104 105 /** 106 * Constructs a new BOM InputStream that excludes 107 * the specified BOMs. 108 * @param delegate the InputStream to delegate to 109 * @param boms The BOMs to detect and exclude 110 */ 111 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) { 112 this(delegate, false, boms); 113 } 114 115 /** 116 * Constructs a new BOM InputStream that detects the 117 * specified BOMs and optionally includes them. 118 * @param delegate the InputStream to delegate to 119 * @param include true to include the specified BOMs or 120 * false to exclude them 121 * @param boms The BOMs to detect and optionally exclude 122 */ 123 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) { 124 super(delegate); 125 if (boms == null || boms.length == 0) { 126 throw new IllegalArgumentException("No BOMs specified"); 127 } 128 this.include = include; 129 this.boms = Arrays.asList(boms); 130 } 131 132 /** 133 * Indicates whether the stream contains one of the specified BOMs. 134 * 135 * @return true if the stream has one of the specified BOMs, otherwise false 136 * if it does not 137 * @throws IOException if an error reading the first bytes of the stream occurs 138 */ 139 public boolean hasBOM() throws IOException { 140 return (getBOM() != null); 141 } 142 143 /** 144 * Indicates whether the stream contains the specified BOM. 145 * 146 * @param bom The BOM to check for 147 * @return true if the stream has the specified BOM, otherwise false 148 * if it does not 149 * @throws IllegalArgumentException if the BOM is not one the stream 150 * is configured to detect 151 * @throws IOException if an error reading the first bytes of the stream occurs 152 */ 153 public boolean hasBOM(ByteOrderMark bom) throws IOException { 154 if (!boms.contains(bom)) { 155 throw new IllegalArgumentException("Stream not configure to detect " + bom); 156 } 157 return (byteOrderMark != null && getBOM().equals(bom)); 158 } 159 160 /** 161 * Return the BOM (Byte Order Mark). 162 * 163 * @return The BOM or null if none 164 * @throws IOException if an error reading the first bytes of the stream occurs 165 */ 166 public ByteOrderMark getBOM() throws IOException { 167 if (firstBytes == null) { 168 int max = 0; 169 for (ByteOrderMark bom : boms) { 170 max = Math.max(max, bom.length()); 171 } 172 firstBytes = new int[max]; 173 for (int i = 0; i < firstBytes.length; i++) { 174 firstBytes[i] = in.read(); 175 fbLength++; 176 if (firstBytes[i] < 0) { 177 break; 178 } 179 180 byteOrderMark = find(); 181 if (byteOrderMark != null) { 182 if (!include) { 183 fbLength = 0; 184 } 185 break; 186 } 187 } 188 } 189 return byteOrderMark; 190 } 191 192 /** 193 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 194 * 195 * @return The BOM charset Name or null if no BOM found 196 * @throws IOException if an error reading the first bytes of the stream occurs 197 * 198 */ 199 public String getBOMCharsetName() throws IOException { 200 getBOM(); 201 return (byteOrderMark == null ? null : byteOrderMark.getCharsetName()); 202 } 203 204 /** 205 * This method reads and either preserves or skips the first bytes in the 206 * stream. It behaves like the single-byte <code>read()</code> method, 207 * either returning a valid byte or -1 to indicate that the initial bytes 208 * have been processed already. 209 * @return the byte read (excluding BOM) or -1 if the end of stream 210 * @throws IOException if an I/O error occurs 211 */ 212 private int readFirstBytes() throws IOException { 213 getBOM(); 214 return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1; 215 } 216 217 /** 218 * Find a BOM with the specified bytes. 219 * 220 * @return The matched BOM or null if none matched 221 */ 222 private ByteOrderMark find() { 223 for (ByteOrderMark bom : boms) { 224 if (matches(bom)) { 225 return bom; 226 } 227 } 228 return null; 229 } 230 231 /** 232 * Check if the bytes match a BOM. 233 * 234 * @param bom The BOM 235 * @return true if the bytes match the bom, otherwise false 236 */ 237 private boolean matches(ByteOrderMark bom) { 238 if (bom.length() != fbLength) { 239 return false; 240 } 241 for (int i = 0; i < bom.length(); i++) { 242 if (bom.get(i) != firstBytes[i]) { 243 return false; 244 } 245 } 246 return true; 247 } 248 249 //---------------------------------------------------------------------------- 250 // Implementation of InputStream 251 //---------------------------------------------------------------------------- 252 253 /** 254 * Invokes the delegate's <code>read()</code> method, detecting and 255 * optionally skipping BOM. 256 * @return the byte read (excluding BOM) or -1 if the end of stream 257 * @throws IOException if an I/O error occurs 258 */ 259 @Override 260 public int read() throws IOException { 261 int b = readFirstBytes(); 262 return (b >= 0) ? b : in.read(); 263 } 264 265 /** 266 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting 267 * and optionally skipping BOM. 268 * @param buf the buffer to read the bytes into 269 * @param off The start offset 270 * @param len The number of bytes to read (excluding BOM) 271 * @return the number of bytes read or -1 if the end of stream 272 * @throws IOException if an I/O error occurs 273 */ 274 @Override 275 public int read(byte[] buf, int off, int len) throws IOException { 276 int firstCount = 0; 277 int b = 0; 278 while ((len > 0) && (b >= 0)) { 279 b = readFirstBytes(); 280 if (b >= 0) { 281 buf[off++] = (byte) (b & 0xFF); 282 len--; 283 firstCount++; 284 } 285 } 286 int secondCount = in.read(buf, off, len); 287 return (secondCount < 0) ? (firstCount > 0 ? firstCount : -1) : firstCount + secondCount; 288 } 289 290 /** 291 * Invokes the delegate's <code>read(byte[])</code> method, detecting and 292 * optionally skipping BOM. 293 * @param buf the buffer to read the bytes into 294 * @return the number of bytes read (excluding BOM) 295 * or -1 if the end of stream 296 * @throws IOException if an I/O error occurs 297 */ 298 @Override 299 public int read(byte[] buf) throws IOException { 300 return read(buf, 0, buf.length); 301 } 302 303 /** 304 * Invokes the delegate's <code>mark(int)</code> method. 305 * @param readlimit read ahead limit 306 */ 307 @Override 308 public synchronized void mark(int readlimit) { 309 markFbIndex = fbIndex; 310 markedAtStart = (firstBytes == null); 311 in.mark(readlimit); 312 } 313 314 /** 315 * Invokes the delegate's <code>reset()</code> method. 316 * @throws IOException if an I/O error occurs 317 */ 318 @Override 319 public synchronized void reset() throws IOException { 320 fbIndex = markFbIndex; 321 if (markedAtStart) { 322 firstBytes = null; 323 } 324 325 in.reset(); 326 } 327 328 /** 329 * Invokes the delegate's <code>skip(long)</code> method, detecting 330 * and optionallyskipping BOM. 331 * @param n the number of bytes to skip 332 * @return the number of bytes to skipped or -1 if the end of stream 333 * @throws IOException if an I/O error occurs 334 */ 335 @Override 336 public long skip(long n) throws IOException { 337 while ((n > 0) && (readFirstBytes() >= 0)) { 338 n--; 339 } 340 return in.skip(n); 341 } 342 }