/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io;

import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Objects;

24  /**
25   * Byte Order Mark (BOM) representation. See {@link org.apache.commons.io.input.BOMInputStream}.
26   * <p>
27   * We define the follow BOM constants:
28   * </p>
29   * <ul>
30   * <li>{@link #UTF_16BE}</li>
31   * <li>{@link #UTF_16LE}</li>
32   * <li>{@link #UTF_32BE}</li>
33   * <li>{@link #UTF_32LE}</li>
34   * <li>{@link #UTF_8}</li>
35   * </ul>
36   * <h2>Deprecating Serialization</h2>
37   * <p>
38   * <em>Serialization is deprecated and will be removed in 3.0.</em>
39   * </p>
40   *
41   * @see org.apache.commons.io.input.BOMInputStream
42   * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia: Byte Order Mark</a>
43   * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing">W3C: Autodetection of Character Encodings
44   *      (Non-Normative)</a>
45   * @since 2.0
46   */
47  public class ByteOrderMark implements Serializable {
48  
49      private static final long serialVersionUID = 1L;
50  
51      /**
52       * UTF-8 BOM.
53       * <p>
54       * This BOM is:
55       * </p>
56       * <pre>
57       * 0xEF 0xBB 0xBF
58       * </pre>
59       */
60      public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8.name(), 0xEF, 0xBB, 0xBF);
61  
62      /**
63       * UTF-16BE BOM (Big-Endian).
64       * <p>
65       * This BOM is:
66       * </p>
67       * <pre>
68       * 0xFE 0xFF
69       * </pre>
70       */
71      public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE.name(), 0xFE, 0xFF);
72  
73      /**
74       * UTF-16LE BOM (Little-Endian).
75       * <p>
76       * This BOM is:
77       * </p>
78       * <pre>
79       * 0xFF 0xFE
80       * </pre>
81       */
82      public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE.name(), 0xFF, 0xFE);
83  
84      /**
85       * UTF-32BE BOM (Big-Endian).
86       * <p>
87       * This BOM is:
88       * </p>
89       * <pre>
90       * 0x00 0x00 0xFE 0xFF
91       * </pre>
92       *
93       * @since 2.2
94       */
95      public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);
96  
97      /**
98       * UTF-32LE BOM (Little-Endian).
99       * <p>
100      * This BOM is:
101      * </p>
102      * <pre>
103      * 0xFF 0xFE 0x00 0x00
104      * </pre>
105      *
106      * @since 2.2
107      */
108     public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);
109 
110     /**
111      * Unicode BOM character; external form depends on the encoding.
112      *
113      * @see <a href="https://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
114      * @since 2.5
115      */
116     public static final char UTF_BOM = '\uFEFF';
117 
118     /**
119      * Charset name.
120      */
121     private final String charsetName;
122 
123     /**
124      * Bytes.
125      */
126     private final int[] bytes;
127 
128     /**
129      * Constructs a new instance.
130      *
131      * @param charsetName The name of the charset the BOM represents
132      * @param bytes The BOM's bytes
133      * @throws IllegalArgumentException if the charsetName is zero length
134      * @throws IllegalArgumentException if the bytes are zero length
135      */
136     public ByteOrderMark(final String charsetName, final int... bytes) {
137         Objects.requireNonNull(charsetName, "charsetName");
138         Objects.requireNonNull(bytes, "bytes");
139         if (charsetName.isEmpty()) {
140             throw new IllegalArgumentException("No charsetName specified");
141         }
142         if (bytes.length == 0) {
143             throw new IllegalArgumentException("No bytes specified");
144         }
145         this.charsetName = charsetName;
146         this.bytes = bytes.clone();
147     }
148 
149     /**
150      * Indicates if this instance's bytes equals another.
151      *
152      * @param obj The object to compare to
153      * @return true if the bom's bytes are equal, otherwise
154      * false
155      */
156     @Override
157     public boolean equals(final Object obj) {
158         if (!(obj instanceof ByteOrderMark)) {
159             return false;
160         }
161         final ByteOrderMark bom = (ByteOrderMark) obj;
162         if (bytes.length != bom.length()) {
163             return false;
164         }
165         for (int i = 0; i < bytes.length; i++) {
166             if (bytes[i] != bom.get(i)) {
167                 return false;
168             }
169         }
170         return true;
171     }
172 
173     /**
174      * Gets the byte at the specified position.
175      *
176      * @param pos The position
177      * @return The specified byte
178      */
179     public int get(final int pos) {
180         return bytes[pos];
181     }
182 
183     /**
184      * Gets a copy of the BOM's bytes.
185      *
186      * @return a copy of the BOM's bytes
187      */
188     public byte[] getBytes() {
189         final byte[] copy = IOUtils.byteArray(bytes.length);
190         for (int i = 0; i < bytes.length; i++) {
191             copy[i] = (byte) bytes[i];
192         }
193         return copy;
194     }
195 
196     /**
197      * Gets the name of the {@link java.nio.charset.Charset} the BOM represents.
198      *
199      * @return the character set name
200      */
201     public String getCharsetName() {
202         return charsetName;
203     }
204 
205     int[] getRawBytes() {
206         return bytes;
207     }
208 
209     /**
210      * Computes the hash code for this BOM.
211      *
212      * @return the hash code for this BOM.
213      * @see Object#hashCode()
214      */
215     @Override
216     public int hashCode() {
217         int hashCode = getClass().hashCode();
218         for (final int b : bytes) {
219             hashCode += b;
220         }
221         return hashCode;
222     }
223 
224     /**
225      * Gets the length of the BOM's bytes.
226      *
227      * @return the length of the BOM's bytes
228      */
229     public int length() {
230         return bytes.length;
231     }
232 
233     /**
234      * Tests whether the given array starts with the bytes for this BOM.
235      *
236      * @param test the array to test.
237      * @return whether the given array starts with the bytes for this BOM.
238      * @since 2.19.0
239      */
240     public boolean matches(final int[] test) {
241         // Our test are never null.
242         if (bytes == test) {
243             return true;
244         }
245         if (test == null) {
246             return false;
247         }
248         final int length = bytes.length;
249         if (test.length < length) {
250             return false;
251         }
252         for (int i = 0; i < length; i++) {
253             if (bytes[i] != test[i]) {
254                 return false;
255             }
256         }
257         return true;
258     }
259 
260     /**
261      * Converts this instance to a String representation of the BOM.
262      *
263      * @return the length of the BOM's bytes
264      */
265     @Override
266     public String toString() {
267         final StringBuilder builder = new StringBuilder();
268         builder.append(getClass().getSimpleName());
269         builder.append('[');
270         builder.append(charsetName);
271         builder.append(": ");
272         for (int i = 0; i < bytes.length; i++) {
273             if (i > 0) {
274                 builder.append(",");
275             }
276             builder.append("0x");
277             builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
278         }
279         builder.append(']');
280         return builder.toString();
281     }
282 
283 }