001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.io.input;
018
019 import java.io.IOException;
020 import java.io.InputStream;
021 import java.util.Arrays;
022 import java.util.List;
023
024 import org.apache.commons.io.ByteOrderMark;
025
026 /**
027 * This class is used to wrap a stream that includes an encoded
028 * {@link ByteOrderMark} as its first bytes.
029 *
030 * This class detects these bytes and, if required, can automatically skip them
031 * and return the subsequent byte as the first byte in the stream.
032 *
033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034 * <ul>
035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038 * </ul>
039 *
040 *
041 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
042 * <pre>
043 * BOMInputStream bomIn = new BOMInputStream(in);
044 * if (bomIn.hasBOM()) {
045 * // has a UTF-8 BOM
046 * }
047 * </pre>
048 *
049 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
050 * <pre>
051 * boolean include = true;
052 * BOMInputStream bomIn = new BOMInputStream(in, include);
053 * if (bomIn.hasBOM()) {
054 * // has a UTF-8 BOM
055 * }
056 * </pre>
057 *
058 * <h3>Example 3 - Detect Multiple BOMs</h3>
059 * <pre>
060 * BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
061 * if (bomIn.hasBOM() == false) {
062 * // No BOM found
063 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
064 * // has a UTF-16LE BOM
065 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
066 * // has a UTF-16BE BOM
067 * }
068 * </pre>
069 *
070 * @see org.apache.commons.io.ByteOrderMark
071 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
072 * @version $Id: BOMInputStream.java 1304052 2012-03-22 20:55:29Z ggregory $
073 * @since 2.0
074 */
075 public class BOMInputStream extends ProxyInputStream {
076 private final boolean include;
077 private final List<ByteOrderMark> boms;
078 private ByteOrderMark byteOrderMark;
079 private int[] firstBytes;
080 private int fbLength;
081 private int fbIndex;
082 private int markFbIndex;
083 private boolean markedAtStart;
084
085 /**
086 * Constructs a new BOM InputStream that excludes
087 * a {@link ByteOrderMark#UTF_8} BOM.
088 * @param delegate the InputStream to delegate to
089 */
090 public BOMInputStream(InputStream delegate) {
091 this(delegate, false, ByteOrderMark.UTF_8);
092 }
093
094 /**
095 * Constructs a new BOM InputStream that detects a
096 * a {@link ByteOrderMark#UTF_8} and optionally includes it.
097 * @param delegate the InputStream to delegate to
098 * @param include true to include the UTF-8 BOM or
099 * false to exclude it
100 */
101 public BOMInputStream(InputStream delegate, boolean include) {
102 this(delegate, include, ByteOrderMark.UTF_8);
103 }
104
105 /**
106 * Constructs a new BOM InputStream that excludes
107 * the specified BOMs.
108 * @param delegate the InputStream to delegate to
109 * @param boms The BOMs to detect and exclude
110 */
111 public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
112 this(delegate, false, boms);
113 }
114
115 /**
116 * Constructs a new BOM InputStream that detects the
117 * specified BOMs and optionally includes them.
118 * @param delegate the InputStream to delegate to
119 * @param include true to include the specified BOMs or
120 * false to exclude them
121 * @param boms The BOMs to detect and optionally exclude
122 */
123 public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
124 super(delegate);
125 if (boms == null || boms.length == 0) {
126 throw new IllegalArgumentException("No BOMs specified");
127 }
128 this.include = include;
129 this.boms = Arrays.asList(boms);
130 }
131
132 /**
133 * Indicates whether the stream contains one of the specified BOMs.
134 *
135 * @return true if the stream has one of the specified BOMs, otherwise false
136 * if it does not
137 * @throws IOException if an error reading the first bytes of the stream occurs
138 */
139 public boolean hasBOM() throws IOException {
140 return getBOM() != null;
141 }
142
143 /**
144 * Indicates whether the stream contains the specified BOM.
145 *
146 * @param bom The BOM to check for
147 * @return true if the stream has the specified BOM, otherwise false
148 * if it does not
149 * @throws IllegalArgumentException if the BOM is not one the stream
150 * is configured to detect
151 * @throws IOException if an error reading the first bytes of the stream occurs
152 */
153 public boolean hasBOM(ByteOrderMark bom) throws IOException {
154 if (!boms.contains(bom)) {
155 throw new IllegalArgumentException("Stream not configure to detect " + bom);
156 }
157 return byteOrderMark != null && getBOM().equals(bom);
158 }
159
160 /**
161 * Return the BOM (Byte Order Mark).
162 *
163 * @return The BOM or null if none
164 * @throws IOException if an error reading the first bytes of the stream occurs
165 */
166 public ByteOrderMark getBOM() throws IOException {
167 if (firstBytes == null) {
168 fbLength = 0;
169 int max = 0;
170 for (ByteOrderMark bom : boms) {
171 max = Math.max(max, bom.length());
172 }
173 firstBytes = new int[max];
174 for (int i = 0; i < firstBytes.length; i++) {
175 firstBytes[i] = in.read();
176 fbLength++;
177 if (firstBytes[i] < 0) {
178 break;
179 }
180
181 byteOrderMark = find();
182 if (byteOrderMark != null) {
183 if (!include) {
184 fbLength = 0;
185 }
186 break;
187 }
188 }
189 }
190 return byteOrderMark;
191 }
192
193 /**
194 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
195 *
196 * @return The BOM charset Name or null if no BOM found
197 * @throws IOException if an error reading the first bytes of the stream occurs
198 *
199 */
200 public String getBOMCharsetName() throws IOException {
201 getBOM();
202 return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
203 }
204
205 /**
206 * This method reads and either preserves or skips the first bytes in the
207 * stream. It behaves like the single-byte <code>read()</code> method,
208 * either returning a valid byte or -1 to indicate that the initial bytes
209 * have been processed already.
210 * @return the byte read (excluding BOM) or -1 if the end of stream
211 * @throws IOException if an I/O error occurs
212 */
213 private int readFirstBytes() throws IOException {
214 getBOM();
215 return fbIndex < fbLength ? firstBytes[fbIndex++] : -1;
216 }
217
218 /**
219 * Find a BOM with the specified bytes.
220 *
221 * @return The matched BOM or null if none matched
222 */
223 private ByteOrderMark find() {
224 for (ByteOrderMark bom : boms) {
225 if (matches(bom)) {
226 return bom;
227 }
228 }
229 return null;
230 }
231
232 /**
233 * Check if the bytes match a BOM.
234 *
235 * @param bom The BOM
236 * @return true if the bytes match the bom, otherwise false
237 */
238 private boolean matches(ByteOrderMark bom) {
239 if (bom.length() != fbLength) {
240 return false;
241 }
242 for (int i = 0; i < bom.length(); i++) {
243 if (bom.get(i) != firstBytes[i]) {
244 return false;
245 }
246 }
247 return true;
248 }
249
250 //----------------------------------------------------------------------------
251 // Implementation of InputStream
252 //----------------------------------------------------------------------------
253
254 /**
255 * Invokes the delegate's <code>read()</code> method, detecting and
256 * optionally skipping BOM.
257 * @return the byte read (excluding BOM) or -1 if the end of stream
258 * @throws IOException if an I/O error occurs
259 */
260 @Override
261 public int read() throws IOException {
262 int b = readFirstBytes();
263 return b >= 0 ? b : in.read();
264 }
265
266 /**
267 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting
268 * and optionally skipping BOM.
269 * @param buf the buffer to read the bytes into
270 * @param off The start offset
271 * @param len The number of bytes to read (excluding BOM)
272 * @return the number of bytes read or -1 if the end of stream
273 * @throws IOException if an I/O error occurs
274 */
275 @Override
276 public int read(byte[] buf, int off, int len) throws IOException {
277 int firstCount = 0;
278 int b = 0;
279 while (len > 0 && b >= 0) {
280 b = readFirstBytes();
281 if (b >= 0) {
282 buf[off++] = (byte) (b & 0xFF);
283 len--;
284 firstCount++;
285 }
286 }
287 int secondCount = in.read(buf, off, len);
288 return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount;
289 }
290
291 /**
292 * Invokes the delegate's <code>read(byte[])</code> method, detecting and
293 * optionally skipping BOM.
294 * @param buf the buffer to read the bytes into
295 * @return the number of bytes read (excluding BOM)
296 * or -1 if the end of stream
297 * @throws IOException if an I/O error occurs
298 */
299 @Override
300 public int read(byte[] buf) throws IOException {
301 return read(buf, 0, buf.length);
302 }
303
304 /**
305 * Invokes the delegate's <code>mark(int)</code> method.
306 * @param readlimit read ahead limit
307 */
308 @Override
309 public synchronized void mark(int readlimit) {
310 markFbIndex = fbIndex;
311 markedAtStart = firstBytes == null;
312 in.mark(readlimit);
313 }
314
315 /**
316 * Invokes the delegate's <code>reset()</code> method.
317 * @throws IOException if an I/O error occurs
318 */
319 @Override
320 public synchronized void reset() throws IOException {
321 fbIndex = markFbIndex;
322 if (markedAtStart) {
323 firstBytes = null;
324 }
325
326 in.reset();
327 }
328
329 /**
330 * Invokes the delegate's <code>skip(long)</code> method, detecting
331 * and optionallyskipping BOM.
332 * @param n the number of bytes to skip
333 * @return the number of bytes to skipped or -1 if the end of stream
334 * @throws IOException if an I/O error occurs
335 */
336 @Override
337 public long skip(long n) throws IOException {
338 while (n > 0 && readFirstBytes() >= 0) {
339 n--;
340 }
341 return in.skip(n);
342 }
343 }