View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io;
18  
19  import java.io.Serializable;
20  import java.nio.charset.StandardCharsets;
21  import java.util.Locale;
22  import java.util.Objects;
23  
24  /**
25   * Byte Order Mark (BOM) representation. See {@link org.apache.commons.io.input.BOMInputStream}.
26   * <p>
27   * We define the follow BOM constants:
28   * </p>
29   * <ul>
30   * <li>{@link #UTF_16BE}</li>
31   * <li>{@link #UTF_16LE}</li>
32   * <li>{@link #UTF_32BE}</li>
33   * <li>{@link #UTF_32LE}</li>
34   * <li>{@link #UTF_8}</li>
35   * </ul>
36   * <h2>Deprecating Serialization</h2>
37   * <p>
38   * <em>Serialization is deprecated and will be removed in 3.0.</em>
39   * </p>
40   *
41   * @see org.apache.commons.io.input.BOMInputStream
42   * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia: Byte Order Mark</a>
43   * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing">W3C: Autodetection of Character Encodings
44   *      (Non-Normative)</a>
45   * @since 2.0
46   */
47  public class ByteOrderMark implements Serializable {
48  
49      private static final long serialVersionUID = 1L;
50  
51      /**
52       * UTF-8 BOM.
53       * <p>
54       * This BOM is:
55       * </p>
56       * <pre>
57       * 0xEF 0xBB 0xBF
58       * </pre>
59       */
60      public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8.name(), 0xEF, 0xBB, 0xBF);
61  
62      /**
63       * UTF-16BE BOM (Big-Endian).
64       * <p>
65       * This BOM is:
66       * </p>
67       * <pre>
68       * 0xFE 0xFF
69       * </pre>
70       */
71      public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE.name(), 0xFE, 0xFF);
72  
73      /**
74       * UTF-16LE BOM (Little-Endian).
75       * <p>
76       * This BOM is:
77       * </p>
78       * <pre>
79       * 0xFF 0xFE
80       * </pre>
81       */
82      public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE.name(), 0xFF, 0xFE);
83  
84      /**
85       * UTF-32BE BOM (Big-Endian).
86       * <p>
87       * This BOM is:
88       * </p>
89       * <pre>
90       * 0x00 0x00 0xFE 0xFF
91       * </pre>
92       *
93       * @since 2.2
94       */
95      public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);
96  
97      /**
98       * UTF-32LE BOM (Little-Endian).
99       * <p>
100      * This BOM is:
101      * </p>
102      * <pre>
103      * 0xFF 0xFE 0x00 0x00
104      * </pre>
105      *
106      * @since 2.2
107      */
108     public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);
109 
110     /**
111      * Unicode BOM character; external form depends on the encoding.
112      *
113      * @see <a href="https://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
114      * @since 2.5
115      */
116     public static final char UTF_BOM = '\uFEFF';
117 
118     /**
119      * Charset name.
120      */
121     private final String charsetName;
122 
123     /**
124      * Bytes.
125      */
126     private final int[] bytes;
127 
128     /**
129      * Constructs a new instance.
130      *
131      * @param charsetName The name of the charset the BOM represents
132      * @param bytes The BOM's bytes
133      * @throws IllegalArgumentException if the charsetName is zero length
134      * @throws IllegalArgumentException if the bytes are zero length
135      */
136     public ByteOrderMark(final String charsetName, final int... bytes) {
137         Objects.requireNonNull(charsetName, "charsetName");
138         Objects.requireNonNull(bytes, "bytes");
139         if (charsetName.isEmpty()) {
140             throw new IllegalArgumentException("No charsetName specified");
141         }
142         if (bytes.length == 0) {
143             throw new IllegalArgumentException("No bytes specified");
144         }
145         this.charsetName = charsetName;
146         this.bytes = bytes.clone();
147     }
148 
149     /**
150      * Indicates if this instance's bytes equals another.
151      *
152      * @param obj The object to compare to
153      * @return true if the bom's bytes are equal, otherwise
154      * false
155      */
156     @Override
157     public boolean equals(final Object obj) {
158         if (!(obj instanceof ByteOrderMark)) {
159             return false;
160         }
161         final ByteOrderMark bom = (ByteOrderMark) obj;
162         if (bytes.length != bom.length()) {
163             return false;
164         }
165         for (int i = 0; i < bytes.length; i++) {
166             if (bytes[i] != bom.get(i)) {
167                 return false;
168             }
169         }
170         return true;
171     }
172 
173     /**
174      * Gets the byte at the specified position.
175      *
176      * @param pos The position
177      * @return The specified byte
178      */
179     public int get(final int pos) {
180         return bytes[pos];
181     }
182 
183     /**
184      * Gets a copy of the BOM's bytes.
185      *
186      * @return a copy of the BOM's bytes
187      */
188     public byte[] getBytes() {
189         final byte[] copy = IOUtils.byteArray(bytes.length);
190         for (int i = 0; i < bytes.length; i++) {
191             copy[i] = (byte) bytes[i];
192         }
193         return copy;
194     }
195 
196     /**
197      * Gets the name of the {@link java.nio.charset.Charset} the BOM represents.
198      *
199      * @return the character set name
200      */
201     public String getCharsetName() {
202         return charsetName;
203     }
204 
205     /**
206      * Computes the hash code for this BOM.
207      *
208      * @return the hash code for this BOM.
209      * @see Object#hashCode()
210      */
211     @Override
212     public int hashCode() {
213         int hashCode = getClass().hashCode();
214         for (final int b : bytes) {
215             hashCode += b;
216         }
217         return hashCode;
218     }
219 
220     /**
221      * Gets the length of the BOM's bytes.
222      *
223      * @return the length of the BOM's bytes
224      */
225     public int length() {
226         return bytes.length;
227     }
228 
229     /**
230      * Converts this instance to a String representation of the BOM.
231      *
232      * @return the length of the BOM's bytes
233      */
234     @Override
235     public String toString() {
236         final StringBuilder builder = new StringBuilder();
237         builder.append(getClass().getSimpleName());
238         builder.append('[');
239         builder.append(charsetName);
240         builder.append(": ");
241         for (int i = 0; i < bytes.length; i++) {
242             if (i > 0) {
243                 builder.append(",");
244             }
245             builder.append("0x");
246             builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
247         }
248         builder.append(']');
249         return builder.toString();
250     }
251 
252 }