001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.io.input;
018
019 import java.io.IOException;
020 import java.io.InputStream;
021 import java.util.Arrays;
022 import java.util.Comparator;
023 import java.util.List;
024
025 import org.apache.commons.io.ByteOrderMark;
026
027 /**
028 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
029 *
030 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
031 * first byte in the stream.
032 *
033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034 * <ul>
035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
039 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
040 * </ul>
041 *
042 *
043 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
044 *
045 * <pre>
046 * BOMInputStream bomIn = new BOMInputStream(in);
047 * if (bomIn.hasBOM()) {
048 * // has a UTF-8 BOM
049 * }
050 * </pre>
051 *
052 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
053 *
054 * <pre>
055 * boolean include = true;
056 * BOMInputStream bomIn = new BOMInputStream(in, include);
057 * if (bomIn.hasBOM()) {
058 * // has a UTF-8 BOM
059 * }
060 * </pre>
061 *
062 * <h3>Example 3 - Detect Multiple BOMs</h3>
063 *
064 * <pre>
065 * BOMInputStream bomIn = new BOMInputStream(in,
066 * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
067 * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
068 * );
069 * if (bomIn.hasBOM() == false) {
070 * // No BOM found
071 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
072 * // has a UTF-16LE BOM
073 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
074 * // has a UTF-16BE BOM
075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
076 * // has a UTF-32LE BOM
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
078 * // has a UTF-32BE BOM
079 * }
080 * </pre>
081 *
082 * @see org.apache.commons.io.ByteOrderMark
083 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
084 * @version $Id: BOMInputStream.java 1346400 2012-06-05 14:48:01Z ggregory $
085 * @since 2.0
086 */
087 public class BOMInputStream extends ProxyInputStream {
088 private final boolean include;
089 /**
090 * BOMs are sorted from longest to shortest.
091 */
092 private final List<ByteOrderMark> boms;
093 private ByteOrderMark byteOrderMark;
094 private int[] firstBytes;
095 private int fbLength;
096 private int fbIndex;
097 private int markFbIndex;
098 private boolean markedAtStart;
099
100 /**
101 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
102 *
103 * @param delegate
104 * the InputStream to delegate to
105 */
106 public BOMInputStream(InputStream delegate) {
107 this(delegate, false, ByteOrderMark.UTF_8);
108 }
109
110 /**
111 * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
112 *
113 * @param delegate
114 * the InputStream to delegate to
115 * @param include
116 * true to include the UTF-8 BOM or false to exclude it
117 */
118 public BOMInputStream(InputStream delegate, boolean include) {
119 this(delegate, include, ByteOrderMark.UTF_8);
120 }
121
122 /**
123 * Constructs a new BOM InputStream that excludes the specified BOMs.
124 *
125 * @param delegate
126 * the InputStream to delegate to
127 * @param boms
128 * The BOMs to detect and exclude
129 */
130 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
131 this(delegate, false, boms);
132 }
133
134 /**
135 * Compares ByteOrderMark objects in descending length order.
136 */
137 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
138
139 public int compare(ByteOrderMark bom1, ByteOrderMark bom2) {
140 int len1 = bom1.length();
141 int len2 = bom2.length();
142 if (len1 > len2) {
143 return -1;
144 }
145 if (len2 > len1) {
146 return 1;
147 }
148 return 0;
149 }
150 };
151
152 /**
153 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
154 *
155 * @param delegate
156 * the InputStream to delegate to
157 * @param include
158 * true to include the specified BOMs or false to exclude them
159 * @param boms
160 * The BOMs to detect and optionally exclude
161 */
162 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
163 super(delegate);
164 if (boms == null || boms.length == 0) {
165 throw new IllegalArgumentException("No BOMs specified");
166 }
167 this.include = include;
168 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
169 Arrays.sort(boms, ByteOrderMarkLengthComparator);
170 this.boms = Arrays.asList(boms);
171
172 }
173
174 /**
175 * Indicates whether the stream contains one of the specified BOMs.
176 *
177 * @return true if the stream has one of the specified BOMs, otherwise false if it does not
178 * @throws IOException
179 * if an error reading the first bytes of the stream occurs
180 */
181 public boolean hasBOM() throws IOException {
182 return getBOM() != null;
183 }
184
185 /**
186 * Indicates whether the stream contains the specified BOM.
187 *
188 * @param bom
189 * The BOM to check for
190 * @return true if the stream has the specified BOM, otherwise false if it does not
191 * @throws IllegalArgumentException
192 * if the BOM is not one the stream is configured to detect
193 * @throws IOException
194 * if an error reading the first bytes of the stream occurs
195 */
196 public boolean hasBOM(ByteOrderMark bom) throws IOException {
197 if (!boms.contains(bom)) {
198 throw new IllegalArgumentException("Stream not configure to detect " + bom);
199 }
200 return byteOrderMark != null && getBOM().equals(bom);
201 }
202
203 /**
204 * Return the BOM (Byte Order Mark).
205 *
206 * @return The BOM or null if none
207 * @throws IOException
208 * if an error reading the first bytes of the stream occurs
209 */
210 public ByteOrderMark getBOM() throws IOException {
211 if (firstBytes == null) {
212 fbLength = 0;
213 // BOMs are sorted from longest to shortest
214 final int maxBomSize = boms.get(0).length();
215 firstBytes = new int[maxBomSize];
216 // Read first maxBomSize bytes
217 for (int i = 0; i < firstBytes.length; i++) {
218 firstBytes[i] = in.read();
219 fbLength++;
220 if (firstBytes[i] < 0) {
221 break;
222 }
223 }
224 // match BOM in firstBytes
225 byteOrderMark = find();
226 if (byteOrderMark != null) {
227 if (!include) {
228 if (byteOrderMark.length() < firstBytes.length) {
229 fbIndex = byteOrderMark.length();
230 } else {
231 fbLength = 0;
232 }
233 }
234 }
235 }
236 return byteOrderMark;
237 }
238
239 /**
240 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
241 *
242 * @return The BOM charset Name or null if no BOM found
243 * @throws IOException
244 * if an error reading the first bytes of the stream occurs
245 *
246 */
247 public String getBOMCharsetName() throws IOException {
248 getBOM();
249 return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
250 }
251
252 /**
253 * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
254 * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
255 * processed already.
256 *
257 * @return the byte read (excluding BOM) or -1 if the end of stream
258 * @throws IOException
259 * if an I/O error occurs
260 */
261 private int readFirstBytes() throws IOException {
262 getBOM();
263 return fbIndex < fbLength ? firstBytes[fbIndex++] : -1;
264 }
265
266 /**
267 * Find a BOM with the specified bytes.
268 *
269 * @return The matched BOM or null if none matched
270 */
271 private ByteOrderMark find() {
272 for (ByteOrderMark bom : boms) {
273 if (matches(bom)) {
274 return bom;
275 }
276 }
277 return null;
278 }
279
280 /**
281 * Check if the bytes match a BOM.
282 *
283 * @param bom
284 * The BOM
285 * @return true if the bytes match the bom, otherwise false
286 */
287 private boolean matches(ByteOrderMark bom) {
288 // if (bom.length() != fbLength) {
289 // return false;
290 // }
291 // firstBytes may be bigger than the BOM bytes
292 for (int i = 0; i < bom.length(); i++) {
293 if (bom.get(i) != firstBytes[i]) {
294 return false;
295 }
296 }
297 return true;
298 }
299
300 // ----------------------------------------------------------------------------
301 // Implementation of InputStream
302 // ----------------------------------------------------------------------------
303
304 /**
305 * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
306 *
307 * @return the byte read (excluding BOM) or -1 if the end of stream
308 * @throws IOException
309 * if an I/O error occurs
310 */
311 @Override
312 public int read() throws IOException {
313 int b = readFirstBytes();
314 return b >= 0 ? b : in.read();
315 }
316
317 /**
318 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
319 *
320 * @param buf
321 * the buffer to read the bytes into
322 * @param off
323 * The start offset
324 * @param len
325 * The number of bytes to read (excluding BOM)
326 * @return the number of bytes read or -1 if the end of stream
327 * @throws IOException
328 * if an I/O error occurs
329 */
330 @Override
331 public int read(byte[] buf, int off, int len) throws IOException {
332 int firstCount = 0;
333 int b = 0;
334 while (len > 0 && b >= 0) {
335 b = readFirstBytes();
336 if (b >= 0) {
337 buf[off++] = (byte) (b & 0xFF);
338 len--;
339 firstCount++;
340 }
341 }
342 int secondCount = in.read(buf, off, len);
343 return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount;
344 }
345
346 /**
347 * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
348 *
349 * @param buf
350 * the buffer to read the bytes into
351 * @return the number of bytes read (excluding BOM) or -1 if the end of stream
352 * @throws IOException
353 * if an I/O error occurs
354 */
355 @Override
356 public int read(byte[] buf) throws IOException {
357 return read(buf, 0, buf.length);
358 }
359
360 /**
361 * Invokes the delegate's <code>mark(int)</code> method.
362 *
363 * @param readlimit
364 * read ahead limit
365 */
366 @Override
367 public synchronized void mark(int readlimit) {
368 markFbIndex = fbIndex;
369 markedAtStart = firstBytes == null;
370 in.mark(readlimit);
371 }
372
373 /**
374 * Invokes the delegate's <code>reset()</code> method.
375 *
376 * @throws IOException
377 * if an I/O error occurs
378 */
379 @Override
380 public synchronized void reset() throws IOException {
381 fbIndex = markFbIndex;
382 if (markedAtStart) {
383 firstBytes = null;
384 }
385
386 in.reset();
387 }
388
389 /**
390 * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM.
391 *
392 * @param n
393 * the number of bytes to skip
394 * @return the number of bytes to skipped or -1 if the end of stream
395 * @throws IOException
396 * if an I/O error occurs
397 */
398 @Override
399 public long skip(long n) throws IOException {
400 while (n > 0 && readFirstBytes() >= 0) {
401 n--;
402 }
403 return in.skip(n);
404 }
405 }