001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.io.input; 018 019 import java.io.IOException; 020 import java.io.InputStream; 021 import java.util.Arrays; 022 import java.util.List; 023 024 import org.apache.commons.io.ByteOrderMark; 025 026 /** 027 * This class is used to wrap a stream that includes an encoded 028 * {@link ByteOrderMark} as its first bytes. 029 * 030 * This class detects these bytes and, if required, can automatically skip them 031 * and return the subsequent byte as the first byte in the stream. 032 * 033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 034 * <ul> 035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 038 * </ul> 039 * 040 * 041 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3> 042 * <pre> 043 * BOMInputStream bomIn = new BOMInputStream(in); 044 * if (bomIn.hasBOM()) { 045 * // has a UTF-8 BOM 046 * } 047 * </pre> 048 * 049 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3> 050 * <pre> 051 * boolean include = true; 052 * BOMInputStream bomIn = new BOMInputStream(in, include); 053 * if (bomIn.hasBOM()) { 054 * // has a UTF-8 BOM 055 * } 056 * </pre> 057 * 058 * <h3>Example 3 - Detect Multiple BOMs</h3> 059 * <pre> 060 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE); 061 * if (bomIn.hasBOM() == false) { 062 * // No BOM found 063 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 064 * // has a UTF-16LE BOM 065 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 066 * // has a UTF-16BE BOM 067 * } 068 * </pre> 069 * 070 * @see org.apache.commons.io.ByteOrderMark 071 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 072 * @version $Id: BOMInputStream.java 1304052 2012-03-22 20:55:29Z ggregory $ 073 * @since 2.0 074 */ 075 public class BOMInputStream extends ProxyInputStream { 076 private final boolean include; 077 private final List<ByteOrderMark> boms; 078 private ByteOrderMark byteOrderMark; 079 private int[] firstBytes; 080 private int fbLength; 081 private int fbIndex; 082 private int markFbIndex; 083 private boolean markedAtStart; 084 085 /** 086 * Constructs a new BOM InputStream that excludes 087 * a {@link ByteOrderMark#UTF_8} BOM. 088 * @param delegate the InputStream to delegate to 089 */ 090 public BOMInputStream(InputStream delegate) { 091 this(delegate, false, ByteOrderMark.UTF_8); 092 } 093 094 /** 095 * Constructs a new BOM InputStream that detects a 096 * a {@link ByteOrderMark#UTF_8} and optionally includes it. 097 * @param delegate the InputStream to delegate to 098 * @param include true to include the UTF-8 BOM or 099 * false to exclude it 100 */ 101 public BOMInputStream(InputStream delegate, boolean include) { 102 this(delegate, include, ByteOrderMark.UTF_8); 103 } 104 105 /** 106 * Constructs a new BOM InputStream that excludes 107 * the specified BOMs. 108 * @param delegate the InputStream to delegate to 109 * @param boms The BOMs to detect and exclude 110 */ 111 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) { 112 this(delegate, false, boms); 113 } 114 115 /** 116 * Constructs a new BOM InputStream that detects the 117 * specified BOMs and optionally includes them. 118 * @param delegate the InputStream to delegate to 119 * @param include true to include the specified BOMs or 120 * false to exclude them 121 * @param boms The BOMs to detect and optionally exclude 122 */ 123 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) { 124 super(delegate); 125 if (boms == null || boms.length == 0) { 126 throw new IllegalArgumentException("No BOMs specified"); 127 } 128 this.include = include; 129 this.boms = Arrays.asList(boms); 130 } 131 132 /** 133 * Indicates whether the stream contains one of the specified BOMs. 134 * 135 * @return true if the stream has one of the specified BOMs, otherwise false 136 * if it does not 137 * @throws IOException if an error reading the first bytes of the stream occurs 138 */ 139 public boolean hasBOM() throws IOException { 140 return getBOM() != null; 141 } 142 143 /** 144 * Indicates whether the stream contains the specified BOM. 145 * 146 * @param bom The BOM to check for 147 * @return true if the stream has the specified BOM, otherwise false 148 * if it does not 149 * @throws IllegalArgumentException if the BOM is not one the stream 150 * is configured to detect 151 * @throws IOException if an error reading the first bytes of the stream occurs 152 */ 153 public boolean hasBOM(ByteOrderMark bom) throws IOException { 154 if (!boms.contains(bom)) { 155 throw new IllegalArgumentException("Stream not configure to detect " + bom); 156 } 157 return byteOrderMark != null && getBOM().equals(bom); 158 } 159 160 /** 161 * Return the BOM (Byte Order Mark). 162 * 163 * @return The BOM or null if none 164 * @throws IOException if an error reading the first bytes of the stream occurs 165 */ 166 public ByteOrderMark getBOM() throws IOException { 167 if (firstBytes == null) { 168 fbLength = 0; 169 int max = 0; 170 for (ByteOrderMark bom : boms) { 171 max = Math.max(max, bom.length()); 172 } 173 firstBytes = new int[max]; 174 for (int i = 0; i < firstBytes.length; i++) { 175 firstBytes[i] = in.read(); 176 fbLength++; 177 if (firstBytes[i] < 0) { 178 break; 179 } 180 181 byteOrderMark = find(); 182 if (byteOrderMark != null) { 183 if (!include) { 184 fbLength = 0; 185 } 186 break; 187 } 188 } 189 } 190 return byteOrderMark; 191 } 192 193 /** 194 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 195 * 196 * @return The BOM charset Name or null if no BOM found 197 * @throws IOException if an error reading the first bytes of the stream occurs 198 * 199 */ 200 public String getBOMCharsetName() throws IOException { 201 getBOM(); 202 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 203 } 204 205 /** 206 * This method reads and either preserves or skips the first bytes in the 207 * stream. It behaves like the single-byte <code>read()</code> method, 208 * either returning a valid byte or -1 to indicate that the initial bytes 209 * have been processed already. 210 * @return the byte read (excluding BOM) or -1 if the end of stream 211 * @throws IOException if an I/O error occurs 212 */ 213 private int readFirstBytes() throws IOException { 214 getBOM(); 215 return fbIndex < fbLength ? firstBytes[fbIndex++] : -1; 216 } 217 218 /** 219 * Find a BOM with the specified bytes. 220 * 221 * @return The matched BOM or null if none matched 222 */ 223 private ByteOrderMark find() { 224 for (ByteOrderMark bom : boms) { 225 if (matches(bom)) { 226 return bom; 227 } 228 } 229 return null; 230 } 231 232 /** 233 * Check if the bytes match a BOM. 234 * 235 * @param bom The BOM 236 * @return true if the bytes match the bom, otherwise false 237 */ 238 private boolean matches(ByteOrderMark bom) { 239 if (bom.length() != fbLength) { 240 return false; 241 } 242 for (int i = 0; i < bom.length(); i++) { 243 if (bom.get(i) != firstBytes[i]) { 244 return false; 245 } 246 } 247 return true; 248 } 249 250 //---------------------------------------------------------------------------- 251 // Implementation of InputStream 252 //---------------------------------------------------------------------------- 253 254 /** 255 * Invokes the delegate's <code>read()</code> method, detecting and 256 * optionally skipping BOM. 257 * @return the byte read (excluding BOM) or -1 if the end of stream 258 * @throws IOException if an I/O error occurs 259 */ 260 @Override 261 public int read() throws IOException { 262 int b = readFirstBytes(); 263 return b >= 0 ? b : in.read(); 264 } 265 266 /** 267 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting 268 * and optionally skipping BOM. 269 * @param buf the buffer to read the bytes into 270 * @param off The start offset 271 * @param len The number of bytes to read (excluding BOM) 272 * @return the number of bytes read or -1 if the end of stream 273 * @throws IOException if an I/O error occurs 274 */ 275 @Override 276 public int read(byte[] buf, int off, int len) throws IOException { 277 int firstCount = 0; 278 int b = 0; 279 while (len > 0 && b >= 0) { 280 b = readFirstBytes(); 281 if (b >= 0) { 282 buf[off++] = (byte) (b & 0xFF); 283 len--; 284 firstCount++; 285 } 286 } 287 int secondCount = in.read(buf, off, len); 288 return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount; 289 } 290 291 /** 292 * Invokes the delegate's <code>read(byte[])</code> method, detecting and 293 * optionally skipping BOM. 294 * @param buf the buffer to read the bytes into 295 * @return the number of bytes read (excluding BOM) 296 * or -1 if the end of stream 297 * @throws IOException if an I/O error occurs 298 */ 299 @Override 300 public int read(byte[] buf) throws IOException { 301 return read(buf, 0, buf.length); 302 } 303 304 /** 305 * Invokes the delegate's <code>mark(int)</code> method. 306 * @param readlimit read ahead limit 307 */ 308 @Override 309 public synchronized void mark(int readlimit) { 310 markFbIndex = fbIndex; 311 markedAtStart = firstBytes == null; 312 in.mark(readlimit); 313 } 314 315 /** 316 * Invokes the delegate's <code>reset()</code> method. 317 * @throws IOException if an I/O error occurs 318 */ 319 @Override 320 public synchronized void reset() throws IOException { 321 fbIndex = markFbIndex; 322 if (markedAtStart) { 323 firstBytes = null; 324 } 325 326 in.reset(); 327 } 328 329 /** 330 * Invokes the delegate's <code>skip(long)</code> method, detecting 331 * and optionallyskipping BOM. 332 * @param n the number of bytes to skip 333 * @return the number of bytes to skipped or -1 if the end of stream 334 * @throws IOException if an I/O error occurs 335 */ 336 @Override 337 public long skip(long n) throws IOException { 338 while (n > 0 && readFirstBytes() >= 0) { 339 n--; 340 } 341 return in.skip(n); 342 } 343 }