001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.io.input;
018
019 import java.io.IOException;
020 import java.io.InputStream;
021 import java.util.Arrays;
022 import java.util.List;
023
024 import org.apache.commons.io.ByteOrderMark;
025
026 /**
027 * This class is used to wrap a stream that includes an encoded
028 * {@link ByteOrderMark} as its first bytes.
029 *
030 * This class detects these bytes and, if required, can automatically skip them
031 * and return the subsequent byte as the first byte in the stream.
032 *
033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034 * <ul>
035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038 * </ul>
039 *
040 *
041 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
042 * <pre>
043 * BOMInputStream bomIn = new BOMInputStream(in);
044 * if (bomIn.hasBOM()) {
045 * // has a UTF-8 BOM
046 * }
047 * </pre>
048 *
049 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
050 * <pre>
051 * boolean include = true;
052 * BOMInputStream bomIn = new BOMInputStream(in, include);
053 * if (bomIn.hasBOM()) {
054 * // has a UTF-8 BOM
055 * }
056 * </pre>
057 *
058 * <h3>Example 3 - Detect Multiple BOMs</h3>
059 * <pre>
060 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
061 * if (bomIn.hasBOM() == false) {
062 * // No BOM found
063 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
064 * // has a UTF-16LE BOM
065 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
066 * // has a UTF-16BE BOM
067 * }
068 * </pre>
069 *
070 * @see org.apache.commons.io.ByteOrderMark
071 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
072 * @version $Revision: 1052095 $ $Date: 2010-12-22 23:03:20 +0000 (Wed, 22 Dec 2010) $
073 * @since Commons IO 2.0
074 */
075 public class BOMInputStream extends ProxyInputStream {
076 private final boolean include;
077 private final List<ByteOrderMark> boms;
078 private ByteOrderMark byteOrderMark;
079 private int[] firstBytes;
080 private int fbLength;
081 private int fbIndex;
082 private int markFbIndex;
083 private boolean markedAtStart;
084
085 /**
086 * Constructs a new BOM InputStream that excludes
087 * a {@link ByteOrderMark#UTF_8} BOM.
088 * @param delegate the InputStream to delegate to
089 */
090 public BOMInputStream(InputStream delegate) {
091 this(delegate, false, ByteOrderMark.UTF_8);
092 }
093
094 /**
095 * Constructs a new BOM InputStream that detects a
096 * a {@link ByteOrderMark#UTF_8} and optionally includes it.
097 * @param delegate the InputStream to delegate to
098 * @param include true to include the UTF-8 BOM or
099 * false to exclude it
100 */
101 public BOMInputStream(InputStream delegate, boolean include) {
102 this(delegate, include, ByteOrderMark.UTF_8);
103 }
104
105 /**
106 * Constructs a new BOM InputStream that excludes
107 * the specified BOMs.
108 * @param delegate the InputStream to delegate to
109 * @param boms The BOMs to detect and exclude
110 */
111 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
112 this(delegate, false, boms);
113 }
114
115 /**
116 * Constructs a new BOM InputStream that detects the
117 * specified BOMs and optionally includes them.
118 * @param delegate the InputStream to delegate to
119 * @param include true to include the specified BOMs or
120 * false to exclude them
121 * @param boms The BOMs to detect and optionally exclude
122 */
123 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
124 super(delegate);
125 if (boms == null || boms.length == 0) {
126 throw new IllegalArgumentException("No BOMs specified");
127 }
128 this.include = include;
129 this.boms = Arrays.asList(boms);
130 }
131
132 /**
133 * Indicates whether the stream contains one of the specified BOMs.
134 *
135 * @return true if the stream has one of the specified BOMs, otherwise false
136 * if it does not
137 * @throws IOException if an error reading the first bytes of the stream occurs
138 */
139 public boolean hasBOM() throws IOException {
140 return (getBOM() != null);
141 }
142
143 /**
144 * Indicates whether the stream contains the specified BOM.
145 *
146 * @param bom The BOM to check for
147 * @return true if the stream has the specified BOM, otherwise false
148 * if it does not
149 * @throws IllegalArgumentException if the BOM is not one the stream
150 * is configured to detect
151 * @throws IOException if an error reading the first bytes of the stream occurs
152 */
153 public boolean hasBOM(ByteOrderMark bom) throws IOException {
154 if (!boms.contains(bom)) {
155 throw new IllegalArgumentException("Stream not configure to detect " + bom);
156 }
157 return (byteOrderMark != null && getBOM().equals(bom));
158 }
159
160 /**
161 * Return the BOM (Byte Order Mark).
162 *
163 * @return The BOM or null if none
164 * @throws IOException if an error reading the first bytes of the stream occurs
165 */
166 public ByteOrderMark getBOM() throws IOException {
167 if (firstBytes == null) {
168 int max = 0;
169 for (ByteOrderMark bom : boms) {
170 max = Math.max(max, bom.length());
171 }
172 firstBytes = new int[max];
173 for (int i = 0; i < firstBytes.length; i++) {
174 firstBytes[i] = in.read();
175 fbLength++;
176 if (firstBytes[i] < 0) {
177 break;
178 }
179
180 byteOrderMark = find();
181 if (byteOrderMark != null) {
182 if (!include) {
183 fbLength = 0;
184 }
185 break;
186 }
187 }
188 }
189 return byteOrderMark;
190 }
191
192 /**
193 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
194 *
195 * @return The BOM charset Name or null if no BOM found
196 * @throws IOException if an error reading the first bytes of the stream occurs
197 *
198 */
199 public String getBOMCharsetName() throws IOException {
200 getBOM();
201 return (byteOrderMark == null ? null : byteOrderMark.getCharsetName());
202 }
203
204 /**
205 * This method reads and either preserves or skips the first bytes in the
206 * stream. It behaves like the single-byte <code>read()</code> method,
207 * either returning a valid byte or -1 to indicate that the initial bytes
208 * have been processed already.
209 * @return the byte read (excluding BOM) or -1 if the end of stream
210 * @throws IOException if an I/O error occurs
211 */
212 private int readFirstBytes() throws IOException {
213 getBOM();
214 return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1;
215 }
216
217 /**
218 * Find a BOM with the specified bytes.
219 *
220 * @return The matched BOM or null if none matched
221 */
222 private ByteOrderMark find() {
223 for (ByteOrderMark bom : boms) {
224 if (matches(bom)) {
225 return bom;
226 }
227 }
228 return null;
229 }
230
231 /**
232 * Check if the bytes match a BOM.
233 *
234 * @param bom The BOM
235 * @return true if the bytes match the bom, otherwise false
236 */
237 private boolean matches(ByteOrderMark bom) {
238 if (bom.length() != fbLength) {
239 return false;
240 }
241 for (int i = 0; i < bom.length(); i++) {
242 if (bom.get(i) != firstBytes[i]) {
243 return false;
244 }
245 }
246 return true;
247 }
248
249 //----------------------------------------------------------------------------
250 // Implementation of InputStream
251 //----------------------------------------------------------------------------
252
253 /**
254 * Invokes the delegate's <code>read()</code> method, detecting and
255 * optionally skipping BOM.
256 * @return the byte read (excluding BOM) or -1 if the end of stream
257 * @throws IOException if an I/O error occurs
258 */
259 @Override
260 public int read() throws IOException {
261 int b = readFirstBytes();
262 return (b >= 0) ? b : in.read();
263 }
264
265 /**
266 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting
267 * and optionally skipping BOM.
268 * @param buf the buffer to read the bytes into
269 * @param off The start offset
270 * @param len The number of bytes to read (excluding BOM)
271 * @return the number of bytes read or -1 if the end of stream
272 * @throws IOException if an I/O error occurs
273 */
274 @Override
275 public int read(byte[] buf, int off, int len) throws IOException {
276 int firstCount = 0;
277 int b = 0;
278 while ((len > 0) && (b >= 0)) {
279 b = readFirstBytes();
280 if (b >= 0) {
281 buf[off++] = (byte) (b & 0xFF);
282 len--;
283 firstCount++;
284 }
285 }
286 int secondCount = in.read(buf, off, len);
287 return (secondCount < 0) ? (firstCount > 0 ? firstCount : -1) : firstCount + secondCount;
288 }
289
290 /**
291 * Invokes the delegate's <code>read(byte[])</code> method, detecting and
292 * optionally skipping BOM.
293 * @param buf the buffer to read the bytes into
294 * @return the number of bytes read (excluding BOM)
295 * or -1 if the end of stream
296 * @throws IOException if an I/O error occurs
297 */
298 @Override
299 public int read(byte[] buf) throws IOException {
300 return read(buf, 0, buf.length);
301 }
302
303 /**
304 * Invokes the delegate's <code>mark(int)</code> method.
305 * @param readlimit read ahead limit
306 */
307 @Override
308 public synchronized void mark(int readlimit) {
309 markFbIndex = fbIndex;
310 markedAtStart = (firstBytes == null);
311 in.mark(readlimit);
312 }
313
314 /**
315 * Invokes the delegate's <code>reset()</code> method.
316 * @throws IOException if an I/O error occurs
317 */
318 @Override
319 public synchronized void reset() throws IOException {
320 fbIndex = markFbIndex;
321 if (markedAtStart) {
322 firstBytes = null;
323 }
324
325 in.reset();
326 }
327
328 /**
329 * Invokes the delegate's <code>skip(long)</code> method, detecting
330 * and optionallyskipping BOM.
331 * @param n the number of bytes to skip
332 * @return the number of bytes to skipped or -1 if the end of stream
333 * @throws IOException if an I/O error occurs
334 */
335 @Override
336 public long skip(long n) throws IOException {
337 while ((n > 0) && (readFirstBytes() >= 0)) {
338 n--;
339 }
340 return in.skip(n);
341 }
342 }