BOMInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.io.input;

  18. import static org.apache.commons.io.IOUtils.EOF;

  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.util.Arrays;
  22. import java.util.Comparator;
  23. import java.util.List;
  24. import java.util.Objects;

  25. import org.apache.commons.io.ByteOrderMark;
  26. import org.apache.commons.io.IOUtils;

  27. /**
  28.  * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
  29.  * <p>
  30.  * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
  31.  * first byte in the stream.
  32.  * </p>
  33.  * <p>
  34.  * The {@link ByteOrderMark} implementation has the following predefined BOMs:
  35.  * </p>
  36.  * <ul>
  37.  * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
  38.  * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
  39.  * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
  40.  * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
  41.  * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
  42.  * </ul>
  43.  * <p>
  44.  * To build an instance, use {@link Builder}.
  45.  * </p>
  46.  * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
  47.  *
  48.  * <pre>
  49.  * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
  50.  * if (bomIn.hasBOM()) {
  51.  *     // has a UTF-8 BOM
  52.  * }
  53.  * </pre>
  54.  *
  55.  * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
  56.  *
  57.  * <pre>
  58.  * boolean include = true;
  59.  * BOMInputStream bomIn = BOMInputStream.builder()
  60.  *     .setInputStream(in)
  61.  *     .setInclude(include)
  62.  *     .get();
  63.  * if (bomIn.hasBOM()) {
  64.  *     // has a UTF-8 BOM
  65.  * }
  66.  * </pre>
  67.  *
  68.  * <h2>Example 3 - Detecting Multiple BOMs</h2>
  69.  *
  70.  * <pre>
  71.  * BOMInputStream bomIn = BOMInputStream.builder()
  72.  *   .setInputStream(in)
  73.  *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
  74.  *   .get();
  75.  * if (bomIn.hasBOM() == false) {
  76.  *     // No BOM found
  77.  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
  78.  *     // has a UTF-16LE BOM
  79.  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
  80.  *     // has a UTF-16BE BOM
  81.  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
  82.  *     // has a UTF-32LE BOM
  83.  * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
  84.  *     // has a UTF-32BE BOM
  85.  * }
  86.  * </pre>
  87.  * <p>
  88.  * To build an instance, use {@link Builder}.
  89.  * </p>
  90.  *
  91.  * @see Builder
  92.  * @see org.apache.commons.io.ByteOrderMark
  93.  * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
  94.  * @since 2.0
  95.  */
  96. public class BOMInputStream extends ProxyInputStream {

  97.     // @formatter:off
  98.     /**
  99.      * Builds a new {@link BOMInputStream}.
  100.      *
  101.      * <h2>Using NIO</h2>
  102.      * <pre>{@code
  103.      * BOMInputStream s = BOMInputStream.builder()
  104.      *   .setPath(Paths.get("MyFile.xml"))
  105.      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
  106.      *   .setInclude(false)
  107.      *   .get();}
  108.      * </pre>
  109.      * <h2>Using IO</h2>
  110.      * <pre>{@code
  111.      * BOMInputStream s = BOMInputStream.builder()
  112.      *   .setFile(new File("MyFile.xml"))
  113.      *   .setByteOrderMarks(ByteOrderMark.UTF_8)
  114.      *   .setInclude(false)
  115.      *   .get();}
  116.      * </pre>
  117.      *
  118.      * @see #get()
  119.      * @since 2.12.0
  120.      */
  121.     // @formatter:on
  122.     public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {

  123.         private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

  124.         /**
  125.          * For test access.
  126.          *
  127.          * @return the default byte order mark
  128.          */
  129.         static ByteOrderMark getDefaultByteOrderMark() {
  130.             return DEFAULT[0];
  131.         }

  132.         private ByteOrderMark[] byteOrderMarks = DEFAULT;

  133.         private boolean include;

  134.         /**
  135.          * Constructs a new builder of {@link BOMInputStream}.
  136.          */
  137.         public Builder() {
  138.             // empty
  139.         }

  140.         /**
  141.          * Builds a new {@link BOMInputStream}.
  142.          * <p>
  143.          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
  144.          * </p>
  145.          * <p>
  146.          * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
  147.          * </p>
  148.          * <p>
  149.          * This builder uses the following aspects:
  150.          * </p>
  151.          * <ul>
  152.          * <li>{@link #getInputStream()}</li>
  153.          * <li>include}</li>
  154.          * <li>byteOrderMarks</li>
  155.          * </ul>
  156.          *
  157.          * @return a new instance.
  158.          * @throws IllegalStateException         if the {@code origin} is {@code null}.
  159.          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
  160.          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
  161.          * @see #getInputStream()
  162.          * @see #getUnchecked()
  163.          */
  164.         @Override
  165.         public BOMInputStream get() throws IOException {
  166.             return new BOMInputStream(this);
  167.         }

  168.         /**
  169.          * Sets the ByteOrderMarks to detect and optionally exclude.
  170.          * <p>
  171.          * The default is {@link ByteOrderMark#UTF_8}.
  172.          * </p>
  173.          *
  174.          * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
  175.          * @return {@code this} instance.
  176.          */
  177.         public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
  178.             this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
  179.             return this;
  180.         }

  181.         /**
  182.          * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
  183.          * <p>
  184.          * The default is false.
  185.          * </p>
  186.          *
  187.          * @param include true to include the UTF-8 BOM or false to exclude it. return this;
  188.          * @return {@code this} instance.
  189.          */
  190.         public Builder setInclude(final boolean include) {
  191.             this.include = include;
  192.             return this;
  193.         }

  194.     }

  195.     /**
  196.      * Compares ByteOrderMark objects in descending length order.
  197.      */
  198.     private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();

  199.     /**
  200.      * Constructs a new {@link Builder}.
  201.      *
  202.      * @return a new {@link Builder}.
  203.      * @since 2.12.0
  204.      */
  205.     public static Builder builder() {
  206.         return new Builder();
  207.     }

  208.     /**
  209.      * BOMs are sorted from longest to shortest.
  210.      */
  211.     private final List<ByteOrderMark> bomList;

  212.     private ByteOrderMark byteOrderMark;
  213.     private int fbIndex;
  214.     private int fbLength;
  215.     private int[] firstBytes;
  216.     private final boolean include;
  217.     private boolean markedAtStart;
  218.     private int markFbIndex;

  219.     private BOMInputStream(final Builder builder) throws IOException {
  220.         super(builder);
  221.         if (IOUtils.length(builder.byteOrderMarks) == 0) {
  222.             throw new IllegalArgumentException("No ByteOrderMark specified.");
  223.         }
  224.         this.include = builder.include;
  225.         final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
  226.         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
  227.         list.sort(ByteOrderMarkLengthComparator);
  228.         this.bomList = list;
  229.     }

  230.     /**
  231.      * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
  232.      *
  233.      * @param delegate
  234.      *            the InputStream to delegate to
  235.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  236.      */
  237.     @Deprecated
  238.     public BOMInputStream(final InputStream delegate) {
  239.         this(delegate, false, Builder.DEFAULT);
  240.     }

  241.     /**
  242.      * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
  243.      *
  244.      * @param delegate
  245.      *            the InputStream to delegate to
  246.      * @param include
  247.      *            true to include the UTF-8 BOM or false to exclude it
  248.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  249.      */
  250.     @Deprecated
  251.     public BOMInputStream(final InputStream delegate, final boolean include) {
  252.         this(delegate, include, Builder.DEFAULT);
  253.     }

  254.     /**
  255.      * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
  256.      *
  257.      * @param delegate
  258.      *            the InputStream to delegate to
  259.      * @param include
  260.      *            true to include the specified BOMs or false to exclude them
  261.      * @param boms
  262.      *            The BOMs to detect and optionally exclude
  263.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  264.      */
  265.     @Deprecated
  266.     public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
  267.         super(delegate);
  268.         if (IOUtils.length(boms) == 0) {
  269.             throw new IllegalArgumentException("No BOMs specified");
  270.         }
  271.         this.include = include;
  272.         final List<ByteOrderMark> list = Arrays.asList(boms);
  273.         // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
  274.         list.sort(ByteOrderMarkLengthComparator);
  275.         this.bomList = list;
  276.     }

  277.     /**
  278.      * Constructs a new BOM InputStream that excludes the specified BOMs.
  279.      *
  280.      * @param delegate
  281.      *            the InputStream to delegate to
  282.      * @param boms
  283.      *            The BOMs to detect and exclude
  284.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  285.      */
  286.     @Deprecated
  287.     public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
  288.         this(delegate, false, boms);
  289.     }

  290.     /**
  291.      * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
  292.      *
  293.      * @return The matched BOM or null if none matched.
  294.      */
  295.     private ByteOrderMark find() {
  296.         return bomList.stream().filter(this::matches).findFirst().orElse(null);
  297.     }

  298.     /**
  299.      * Gets the ByteOrderMark (Byte Order Mark).
  300.      *
  301.      * @return The BOM or null if none matched.
  302.      * @throws IOException
  303.      *             if an error reading the first bytes of the stream occurs.
  304.      */
  305.     public ByteOrderMark getBOM() throws IOException {
  306.         if (firstBytes == null) {
  307.             byteOrderMark = readBom();
  308.         }
  309.         return byteOrderMark;
  310.     }

  311.     /**
  312.      * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
  313.      *
  314.      * @return The BOM charset Name or null if no BOM found
  315.      * @throws IOException
  316.      *             if an error reading the first bytes of the stream occurs
  317.      */
  318.     public String getBOMCharsetName() throws IOException {
  319.         getBOM();
  320.         return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
  321.     }

  322.     /**
  323.      * Tests whether the stream contains one of the specified BOMs.
  324.      *
  325.      * @return true if the stream has one of the specified BOMs, otherwise false if it does not
  326.      * @throws IOException
  327.      *             if an error reading the first bytes of the stream occurs
  328.      */
  329.     public boolean hasBOM() throws IOException {
  330.         return getBOM() != null;
  331.     }

  332.     /**
  333.      * Tests whether the stream contains the specified BOM.
  334.      *
  335.      * @param bom
  336.      *            The BOM to check for
  337.      * @return true if the stream has the specified BOM, otherwise false if it does not
  338.      * @throws IllegalArgumentException
  339.      *             if the BOM is not one the stream is configured to detect
  340.      * @throws IOException
  341.      *             if an error reading the first bytes of the stream occurs
  342.      */
  343.     public boolean hasBOM(final ByteOrderMark bom) throws IOException {
  344.         if (!bomList.contains(bom)) {
  345.             throw new IllegalArgumentException("Stream not configured to detect " + bom);
  346.         }
  347.         return Objects.equals(getBOM(), bom);
  348.     }

  349.     /**
  350.      * Invokes the delegate's {@code mark(int)} method.
  351.      *
  352.      * @param readLimit
  353.      *            read ahead limit
  354.      */
  355.     @Override
  356.     public synchronized void mark(final int readLimit) {
  357.         markFbIndex = fbIndex;
  358.         markedAtStart = firstBytes == null;
  359.         in.mark(readLimit);
  360.     }

  361.     /**
  362.      * Checks if the bytes match a BOM.
  363.      *
  364.      * @param bom
  365.      *            The BOM
  366.      * @return true if the bytes match the bom, otherwise false
  367.      */
  368.     private boolean matches(final ByteOrderMark bom) {
  369.         return bom.matches(firstBytes);
  370.     }

  371.     /**
  372.      * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
  373.      *
  374.      * @return the byte read (excluding BOM) or -1 if the end of stream
  375.      * @throws IOException
  376.      *             if an I/O error occurs
  377.      */
  378.     @Override
  379.     public int read() throws IOException {
  380.         checkOpen();
  381.         final int b = readFirstBytes();
  382.         return b >= 0 ? b : in.read();
  383.     }

  384.     /**
  385.      * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
  386.      *
  387.      * @param buf
  388.      *            the buffer to read the bytes into
  389.      * @return the number of bytes read (excluding BOM) or -1 if the end of stream
  390.      * @throws IOException
  391.      *             if an I/O error occurs
  392.      */
  393.     @Override
  394.     public int read(final byte[] buf) throws IOException {
  395.         return read(buf, 0, buf.length);
  396.     }

  397.     /**
  398.      * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
  399.      *
  400.      * @param buf
  401.      *            the buffer to read the bytes into
  402.      * @param off
  403.      *            The start offset
  404.      * @param len
  405.      *            The number of bytes to read (excluding BOM)
  406.      * @return the number of bytes read or -1 if the end of stream
  407.      * @throws IOException
  408.      *             if an I/O error occurs
  409.      */
  410.     @Override
  411.     public int read(final byte[] buf, int off, int len) throws IOException {
  412.         int firstCount = 0;
  413.         int b = 0;
  414.         while (len > 0 && b >= 0) {
  415.             b = readFirstBytes();
  416.             if (b >= 0) {
  417.                 buf[off++] = (byte) (b & 0xFF);
  418.                 len--;
  419.                 firstCount++;
  420.             }
  421.         }
  422.         final int secondCount = in.read(buf, off, len);
  423.         afterRead(secondCount);
  424.         return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
  425.     }

  426.     private ByteOrderMark readBom() throws IOException {
  427.         fbLength = 0;
  428.         // BOMs are sorted from longest to shortest
  429.         final int maxBomSize = bomList.get(0).length();
  430.         firstBytes = new int[maxBomSize];
  431.         // Read first maxBomSize bytes
  432.         for (int i = 0; i < firstBytes.length; i++) {
  433.             firstBytes[i] = in.read();
  434.             afterRead(firstBytes[i]);
  435.             fbLength++;
  436.             if (firstBytes[i] < 0) {
  437.                 break;
  438.             }
  439.         }
  440.         // match BOM in firstBytes
  441.         final ByteOrderMark bom = find();
  442.         if (bom != null && !include) {
  443.             if (bom.length() < firstBytes.length) {
  444.                 fbIndex = bom.length();
  445.             } else {
  446.                 fbLength = 0;
  447.             }
  448.         }
  449.         return bom;
  450.     }

  451.     /**
  452.      * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
  453.      * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
  454.      * processed already.
  455.      *
  456.      * @return the byte read (excluding BOM) or -1 if the end of stream
  457.      * @throws IOException
  458.      *             if an I/O error occurs
  459.      */
  460.     private int readFirstBytes() throws IOException {
  461.         getBOM();
  462.         return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
  463.     }

  464.     /**
  465.      * Invokes the delegate's {@code reset()} method.
  466.      *
  467.      * @throws IOException
  468.      *             if an I/O error occurs
  469.      */
  470.     @Override
  471.     public synchronized void reset() throws IOException {
  472.         fbIndex = markFbIndex;
  473.         if (markedAtStart) {
  474.             firstBytes = null;
  475.         }
  476.         in.reset();
  477.     }

  478.     /**
  479.      * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
  480.      *
  481.      * @param n
  482.      *            the number of bytes to skip
  483.      * @return the number of bytes to skipped or -1 if the end of stream
  484.      * @throws IOException
  485.      *             if an I/O error occurs
  486.      */
  487.     @Override
  488.     public long skip(final long n) throws IOException {
  489.         int skipped = 0;
  490.         while (n > skipped && readFirstBytes() >= 0) {
  491.             skipped++;
  492.         }
  493.         return in.skip(n - skipped) + skipped;
  494.     }
  495. }