ByteOrderMark.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.io;

  18. import java.io.Serializable;
  19. import java.nio.charset.StandardCharsets;
  20. import java.util.Locale;
  21. import java.util.Objects;

  22. /**
  23.  * Byte Order Mark (BOM) representation. See {@link org.apache.commons.io.input.BOMInputStream}.
  24.  * <p>
  25.  * We define the follow BOM constants:
  26.  * </p>
  27.  * <ul>
  28.  * <li>{@link #UTF_16BE}</li>
  29.  * <li>{@link #UTF_16LE}</li>
  30.  * <li>{@link #UTF_32BE}</li>
  31.  * <li>{@link #UTF_32LE}</li>
  32.  * <li>{@link #UTF_8}</li>
  33.  * </ul>
  34.  * <h2>Deprecating Serialization</h2>
  35.  * <p>
  36.  * <em>Serialization is deprecated and will be removed in 3.0.</em>
  37.  * </p>
  38.  *
  39.  * @see org.apache.commons.io.input.BOMInputStream
  40.  * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia: Byte Order Mark</a>
  41.  * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing">W3C: Autodetection of Character Encodings
  42.  *      (Non-Normative)</a>
  43.  * @since 2.0
  44.  */
  45. public class ByteOrderMark implements Serializable {

  46.     private static final long serialVersionUID = 1L;

  47.     /**
  48.      * UTF-8 BOM.
  49.      * <p>
  50.      * This BOM is:
  51.      * </p>
  52.      * <pre>
  53.      * 0xEF 0xBB 0xBF
  54.      * </pre>
  55.      */
  56.     public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8.name(), 0xEF, 0xBB, 0xBF);

  57.     /**
  58.      * UTF-16BE BOM (Big-Endian).
  59.      * <p>
  60.      * This BOM is:
  61.      * </p>
  62.      * <pre>
  63.      * 0xFE 0xFF
  64.      * </pre>
  65.      */
  66.     public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE.name(), 0xFE, 0xFF);

  67.     /**
  68.      * UTF-16LE BOM (Little-Endian).
  69.      * <p>
  70.      * This BOM is:
  71.      * </p>
  72.      * <pre>
  73.      * 0xFF 0xFE
  74.      * </pre>
  75.      */
  76.     public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE.name(), 0xFF, 0xFE);

  77.     /**
  78.      * UTF-32BE BOM (Big-Endian).
  79.      * <p>
  80.      * This BOM is:
  81.      * </p>
  82.      * <pre>
  83.      * 0x00 0x00 0xFE 0xFF
  84.      * </pre>
  85.      *
  86.      * @since 2.2
  87.      */
  88.     public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);

  89.     /**
  90.      * UTF-32LE BOM (Little-Endian).
  91.      * <p>
  92.      * This BOM is:
  93.      * </p>
  94.      * <pre>
  95.      * 0xFF 0xFE 0x00 0x00
  96.      * </pre>
  97.      *
  98.      * @since 2.2
  99.      */
  100.     public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);

  101.     /**
  102.      * Unicode BOM character; external form depends on the encoding.
  103.      *
  104.      * @see <a href="https://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
  105.      * @since 2.5
  106.      */
  107.     public static final char UTF_BOM = '\uFEFF';

  108.     /**
  109.      * Charset name.
  110.      */
  111.     private final String charsetName;

  112.     /**
  113.      * Bytes.
  114.      */
  115.     private final int[] bytes;

  116.     /**
  117.      * Constructs a new instance.
  118.      *
  119.      * @param charsetName The name of the charset the BOM represents
  120.      * @param bytes The BOM's bytes
  121.      * @throws IllegalArgumentException if the charsetName is zero length
  122.      * @throws IllegalArgumentException if the bytes are zero length
  123.      */
  124.     public ByteOrderMark(final String charsetName, final int... bytes) {
  125.         Objects.requireNonNull(charsetName, "charsetName");
  126.         Objects.requireNonNull(bytes, "bytes");
  127.         if (charsetName.isEmpty()) {
  128.             throw new IllegalArgumentException("No charsetName specified");
  129.         }
  130.         if (bytes.length == 0) {
  131.             throw new IllegalArgumentException("No bytes specified");
  132.         }
  133.         this.charsetName = charsetName;
  134.         this.bytes = bytes.clone();
  135.     }

  136.     /**
  137.      * Indicates if this instance's bytes equals another.
  138.      *
  139.      * @param obj The object to compare to
  140.      * @return true if the bom's bytes are equal, otherwise
  141.      * false
  142.      */
  143.     @Override
  144.     public boolean equals(final Object obj) {
  145.         if (!(obj instanceof ByteOrderMark)) {
  146.             return false;
  147.         }
  148.         final ByteOrderMark bom = (ByteOrderMark) obj;
  149.         if (bytes.length != bom.length()) {
  150.             return false;
  151.         }
  152.         for (int i = 0; i < bytes.length; i++) {
  153.             if (bytes[i] != bom.get(i)) {
  154.                 return false;
  155.             }
  156.         }
  157.         return true;
  158.     }

  159.     /**
  160.      * Gets the byte at the specified position.
  161.      *
  162.      * @param pos The position
  163.      * @return The specified byte
  164.      */
  165.     public int get(final int pos) {
  166.         return bytes[pos];
  167.     }

  168.     /**
  169.      * Gets a copy of the BOM's bytes.
  170.      *
  171.      * @return a copy of the BOM's bytes
  172.      */
  173.     public byte[] getBytes() {
  174.         final byte[] copy = IOUtils.byteArray(bytes.length);
  175.         for (int i = 0; i < bytes.length; i++) {
  176.             copy[i] = (byte) bytes[i];
  177.         }
  178.         return copy;
  179.     }

  180.     /**
  181.      * Gets the name of the {@link java.nio.charset.Charset} the BOM represents.
  182.      *
  183.      * @return the character set name
  184.      */
  185.     public String getCharsetName() {
  186.         return charsetName;
  187.     }

  188.     /**
  189.      * Computes the hash code for this BOM.
  190.      *
  191.      * @return the hash code for this BOM.
  192.      * @see Object#hashCode()
  193.      */
  194.     @Override
  195.     public int hashCode() {
  196.         int hashCode = getClass().hashCode();
  197.         for (final int b : bytes) {
  198.             hashCode += b;
  199.         }
  200.         return hashCode;
  201.     }

  202.     /**
  203.      * Gets the length of the BOM's bytes.
  204.      *
  205.      * @return the length of the BOM's bytes
  206.      */
  207.     public int length() {
  208.         return bytes.length;
  209.     }

  210.     /**
  211.      * Converts this instance to a String representation of the BOM.
  212.      *
  213.      * @return the length of the BOM's bytes
  214.      */
  215.     @Override
  216.     public String toString() {
  217.         final StringBuilder builder = new StringBuilder();
  218.         builder.append(getClass().getSimpleName());
  219.         builder.append('[');
  220.         builder.append(charsetName);
  221.         builder.append(": ");
  222.         for (int i = 0; i < bytes.length; i++) {
  223.             if (i > 0) {
  224.                 builder.append(",");
  225.             }
  226.             builder.append("0x");
  227.             builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
  228.         }
  229.         builder.append(']');
  230.         return builder.toString();
  231.     }

  232. }