001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io;
018
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Locale;
import java.util.Objects;
023
024/**
025 * Byte Order Mark (BOM) representation. See {@link org.apache.commons.io.input.BOMInputStream}.
026 * <p>
 * We define the following BOM constants:
028 * </p>
029 * <ul>
030 * <li>{@link #UTF_16BE}</li>
031 * <li>{@link #UTF_16LE}</li>
032 * <li>{@link #UTF_32BE}</li>
033 * <li>{@link #UTF_32LE}</li>
034 * <li>{@link #UTF_8}</li>
035 * </ul>
036 * <h2>Deprecating Serialization</h2>
037 * <p>
038 * <em>Serialization is deprecated and will be removed in 3.0.</em>
039 * </p>
040 *
041 * @see org.apache.commons.io.input.BOMInputStream
042 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia: Byte Order Mark</a>
043 * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing">W3C: Autodetection of Character Encodings
044 *      (Non-Normative)</a>
045 * @since 2.0
046 */
public class ByteOrderMark implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * UTF-8 BOM.
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xEF 0xBB 0xBF
     * </pre>
     */
    public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8.name(), 0xEF, 0xBB, 0xBF);

    /**
     * UTF-16BE BOM (Big-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFE 0xFF
     * </pre>
     */
    public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE.name(), 0xFE, 0xFF);

    /**
     * UTF-16LE BOM (Little-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFF 0xFE
     * </pre>
     */
    public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE.name(), 0xFF, 0xFE);

    /**
     * UTF-32BE BOM (Big-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0x00 0x00 0xFE 0xFF
     * </pre>
     *
     * @since 2.2
     */
    public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);

    /**
     * UTF-32LE BOM (Little-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFF 0xFE 0x00 0x00
     * </pre>
     *
     * @since 2.2
     */
    public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);

    /**
     * Unicode BOM character; external form depends on the encoding.
     *
     * @see <a href="https://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
     * @since 2.5
     */
    public static final char UTF_BOM = '\uFEFF';

    /**
     * Charset name.
     */
    private final String charsetName;

    /**
     * The BOM's byte values, one {@code int} per byte; never empty and never shared with callers.
     */
    private final int[] bytes;

    /**
     * Constructs a new instance.
     * <p>
     * The given array is copied, so later changes to it do not affect this instance.
     * </p>
     *
     * @param charsetName The name of the charset the BOM represents
     * @param bytes The BOM's bytes
     * @throws NullPointerException if the charsetName or the bytes are {@code null}
     * @throws IllegalArgumentException if the charsetName is zero length
     * @throws IllegalArgumentException if the bytes are zero length
     */
    public ByteOrderMark(final String charsetName, final int... bytes) {
        Objects.requireNonNull(charsetName, "charsetName");
        Objects.requireNonNull(bytes, "bytes");
        if (charsetName.isEmpty()) {
            throw new IllegalArgumentException("No charsetName specified");
        }
        if (bytes.length == 0) {
            throw new IllegalArgumentException("No bytes specified");
        }
        this.charsetName = charsetName;
        this.bytes = bytes.clone();
    }

    /**
     * Indicates if this instance's bytes equals another.
     * <p>
     * Only the byte values take part in the comparison; the charset name is ignored. Any
     * {@link ByteOrderMark} (including subclasses) with the same byte sequence is considered equal.
     * </p>
     *
     * @param obj The object to compare to
     * @return true if the bom's bytes are equal, otherwise
     * false
     */
    @Override
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof ByteOrderMark)) {
            return false;
        }
        // Compare the same state that hashCode() digests so the two cannot disagree.
        return Arrays.equals(bytes, ((ByteOrderMark) obj).bytes);
    }

    /**
     * Gets the byte at the specified position.
     *
     * @param pos The position
     * @return The specified byte
     * @throws ArrayIndexOutOfBoundsException if {@code pos} is negative or not less than {@link #length()}
     */
    public int get(final int pos) {
        return bytes[pos];
    }

    /**
     * Gets a copy of the BOM's bytes.
     *
     * @return a copy of the BOM's bytes
     */
    public byte[] getBytes() {
        final byte[] copy = new byte[bytes.length];
        for (int i = 0; i < bytes.length; i++) {
            // Narrowing cast keeps the low 8 bits, e.g. 0xEF becomes (byte) 0xEF.
            copy[i] = (byte) bytes[i];
        }
        return copy;
    }

    /**
     * Gets the name of the {@link java.nio.charset.Charset} the BOM represents.
     *
     * @return the character set name
     */
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * Computes the hash code for this BOM.
     * <p>
     * The hash depends only on the byte sequence (values and order), mirroring {@link #equals(Object)}:
     * instances that are equal, including instances of subclasses, always share a hash code.
     * </p>
     *
     * @return the hash code for this BOM.
     * @see Object#hashCode()
     */
    @Override
    public int hashCode() {
        return Arrays.hashCode(bytes);
    }

    /**
     * Gets the length of the BOM's bytes.
     *
     * @return the length of the BOM's bytes
     */
    public int length() {
        return bytes.length;
    }

    /**
     * Converts this instance to a String representation of the BOM.
     * <p>
     * The format is the simple class name followed by the charset name and the comma-separated bytes in
     * upper-case hexadecimal, for example {@code ByteOrderMark[UTF-8: 0xEF,0xBB,0xBF]}. Hex values are not
     * zero-padded.
     * </p>
     *
     * @return a String representation of this BOM
     */
    @Override
    public String toString() {
        final StringBuilder builder = new StringBuilder();
        builder.append(getClass().getSimpleName());
        builder.append('[');
        builder.append(charsetName);
        builder.append(": ");
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                builder.append(",");
            }
            builder.append("0x");
            // Mask to one byte; Locale.ROOT keeps the casing independent of the default locale.
            builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
        }
        builder.append(']');
        return builder.toString();
    }

}