ReaderInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.io.input;

  18. import static org.apache.commons.io.IOUtils.EOF;

  19. import java.io.BufferedReader;
  20. import java.io.IOException;
  21. import java.io.InputStream;
  22. import java.io.InputStreamReader;
  23. import java.io.OutputStreamWriter;
  24. import java.io.Reader;
  25. import java.nio.ByteBuffer;
  26. import java.nio.CharBuffer;
  27. import java.nio.charset.Charset;
  28. import java.nio.charset.CharsetEncoder;
  29. import java.nio.charset.CoderResult;
  30. import java.nio.charset.CodingErrorAction;
  31. import java.util.Objects;

  32. import org.apache.commons.io.Charsets;
  33. import org.apache.commons.io.IOUtils;
  34. import org.apache.commons.io.build.AbstractStreamBuilder;
  35. import org.apache.commons.io.charset.CharsetEncoders;

  36. /**
  37.  * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
  38.  * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
  39.  * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
  40.  * <p>
  41.  * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
  42.  * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
  43.  * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
  44.  * {@link BufferedReader}.
  45.  * </p>
  46.  * <p>
  47.  * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
  48.  * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
  49.  * </p>
  50.  * <p>
  51.  * To build an instance, use {@link Builder}.
  52.  * </p>
  53.  * <pre>
  54.  * InputStream inputStream = ...
  55.  * Charset cs = ...
  56.  * InputStreamReader reader = new InputStreamReader(inputStream, cs);
  57.  * ReaderInputStream in2 = ReaderInputStream.builder()
  58.  *   .setReader(reader)
  59.  *   .setCharset(cs)
  60.  *   .get();
  61.  * </pre>
  62.  * <p>
  63.  * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
  64.  * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
  65.  * pulls it from the underlying stream.
  66.  * </p>
  67.  * <p>
  68.  * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
  69.  * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
  70.  * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
  71.  * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
  72.  * </p>
  73.  * <p>
  74.  * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
  75.  * </p>
  76.  * <p>
  77.  * Instances of {@link ReaderInputStream} are not thread safe.
  78.  * </p>
  79.  *
  80.  * @see Builder
  81.  * @see org.apache.commons.io.output.WriterOutputStream
  82.  * @since 2.0
  83.  */
  84. public class ReaderInputStream extends AbstractInputStream {

  85.     // @formatter:off
  86.     /**
  87.      * Builds a new {@link ReaderInputStream}.
  88.      *
  89.      * <p>
  90.      * For example:
  91.      * </p>
  92.      * <pre>{@code
  93.      * ReaderInputStream s = ReaderInputStream.builder()
  94.      *   .setPath(path)
  95.      *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
  96.      *   .get();}
  97.      * </pre>
  98.      *
  99.      * @see #get()
  100.      * @since 2.12.0
  101.      */
  102.     // @formatter:on
  103.     public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {

  104.         private CharsetEncoder charsetEncoder = newEncoder(getCharset());

  105.         /**
  106.          * Constructs a new builder of {@link ReaderInputStream}.
  107.          */
  108.         public Builder() {
  109.             // empty
  110.         }

  111.         /**
  112.          * Builds a new {@link ReaderInputStream}.
  113.          *
  114.          * <p>
  115.          * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
  116.          * </p>
  117.          * <p>
  118.          * This builder uses the following aspects:
  119.          * </p>
  120.          * <ul>
  121.          * <li>{@link #getReader()} gets the target aspect.</li>
  122.          * <li>{@link #getBufferSize()}</li>
  123.          * <li>{@link #getCharset()}</li>
  124.          * <li>{@link CharsetEncoder}</li>
  125.          * </ul>
  126.          *
  127.          * @return a new instance.
  128.          * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
  129.          * @throws IllegalStateException         if the {@code origin} is {@code null}.
  130.          * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
  131.          * @see #getReader()
  132.          * @see CharsetEncoder
  133.          * @see #getBufferSize()
  134.          * @see #getUnchecked()
  135.          */
  136.         @Override
  137.         public ReaderInputStream get() throws IOException {
  138.             return new ReaderInputStream(this);
  139.         }

  140.         CharsetEncoder getCharsetEncoder() {
  141.             return charsetEncoder;
  142.         }

  143.         @Override
  144.         public Builder setCharset(final Charset charset) {
  145.             super.setCharset(charset);
  146.             charsetEncoder = newEncoder(getCharset());
  147.             return this;
  148.         }

  149.         /**
  150.          * Sets the charset encoder. Assumes that the caller has configured the encoder.
  151.          *
  152.          * @param newEncoder the charset encoder, null resets to a default encoder.
  153.          * @return {@code this} instance.
  154.          */
  155.         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
  156.             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
  157.             super.setCharset(charsetEncoder.charset());
  158.             return this;
  159.         }

  160.     }

  161.     /**
  162.      * Constructs a new {@link Builder}.
  163.      *
  164.      * @return a new {@link Builder}.
  165.      * @since 2.12.0
  166.      */
  167.     public static Builder builder() {
  168.         return new Builder();
  169.     }

  170.     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
  171.         final float minRequired = minBufferSize(charsetEncoder);
  172.         if (bufferSize < minRequired) {
  173.             throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
  174.                     charsetEncoder.charset().displayName()));
  175.         }
  176.         return bufferSize;
  177.     }

  178.     static float minBufferSize(final CharsetEncoder charsetEncoder) {
  179.         return charsetEncoder.maxBytesPerChar() * 2;
  180.     }

  181.     private static CharsetEncoder newEncoder(final Charset charset) {
  182.         // @formatter:off
  183.         return Charsets.toCharset(charset).newEncoder()
  184.                 .onMalformedInput(CodingErrorAction.REPLACE)
  185.                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
  186.         // @formatter:on
  187.     }

  188.     private final Reader reader;

  189.     private final CharsetEncoder charsetEncoder;

  190.     /**
  191.      * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
  192.      */
  193.     private final CharBuffer encoderIn;
  194.     /**
  195.      * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
  196.      * caller.
  197.      */
  198.     private final ByteBuffer encoderOut;

  199.     private CoderResult lastCoderResult;

  200.     private boolean endOfInput;

  201.     @SuppressWarnings("resource") // caller closes.
  202.     private ReaderInputStream(final Builder builder) throws IOException {
  203.         this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
  204.     }

  205.     /**
  206.      * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@link Charset#defaultCharset() default charset} with a default input buffer
  207.      * size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
  208.      *
  209.      * @param reader the target {@link Reader}
  210.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  211.      */
  212.     @Deprecated
  213.     public ReaderInputStream(final Reader reader) {
  214.         this(reader, Charset.defaultCharset());
  215.     }

  216.     /**
  217.      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
  218.      *
  219.      * <p>
  220.      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
  221.      * </p>
  222.      *
  223.      * @param reader  the target {@link Reader}
  224.      * @param charset the charset encoding
  225.      * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
  226.      */
  227.     @Deprecated
  228.     public ReaderInputStream(final Reader reader, final Charset charset) {
  229.         this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
  230.     }

  231.     /**
  232.      * Constructs a new {@link ReaderInputStream}.
  233.      *
  234.      * <p>
  235.      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
  236.      * </p>
  237.      *
  238.      * @param reader     the target {@link Reader}.
  239.      * @param charset    the charset encoding.
  240.      * @param bufferSize the size of the input buffer in number of characters.
  241.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  242.      */
  243.     @Deprecated
  244.     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
  245.         // @formatter:off
  246.         this(reader,
  247.             Charsets.toCharset(charset).newEncoder()
  248.                     .onMalformedInput(CodingErrorAction.REPLACE)
  249.                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
  250.              bufferSize);
  251.         // @formatter:on
  252.     }

  253.     /**
  254.      * Constructs a new {@link ReaderInputStream}.
  255.      *
  256.      * <p>
  257.      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
  258.      * an encoder which had already been in use.
  259.      * </p>
  260.      *
  261.      * @param reader         the target {@link Reader}
  262.      * @param charsetEncoder the charset encoder
  263.      * @since 2.1
  264.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  265.      */
  266.     @Deprecated
  267.     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
  268.         this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
  269.     }

  270.     /**
  271.      * Constructs a new {@link ReaderInputStream}.
  272.      *
  273.      * <p>
  274.      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
  275.      * an encoder which had already been in use.
  276.      * </p>
  277.      *
  278.      * @param reader         the target {@link Reader}
  279.      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
  280.      * @param bufferSize     the size of the input buffer in number of characters
  281.      * @since 2.1
  282.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  283.      */
  284.     @Deprecated
  285.     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
  286.         this.reader = reader;
  287.         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
  288.         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
  289.         this.encoderIn.flip();
  290.         this.encoderOut = ByteBuffer.allocate(128);
  291.         this.encoderOut.flip();
  292.     }

  293.     /**
  294.      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
  295.      *
  296.      * <p>
  297.      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
  298.      * </p>
  299.      *
  300.      * @param reader      the target {@link Reader}
  301.      * @param charsetName the name of the charset encoding
  302.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  303.      */
  304.     @Deprecated
  305.     public ReaderInputStream(final Reader reader, final String charsetName) {
  306.         this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
  307.     }

  308.     /**
  309.      * Constructs a new {@link ReaderInputStream}.
  310.      *
  311.      * <p>
  312.      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
  313.      * </p>
  314.      *
  315.      * @param reader      the target {@link Reader}
  316.      * @param charsetName the name of the charset encoding, null maps to the default Charset.
  317.      * @param bufferSize  the size of the input buffer in number of characters
  318.      * @deprecated Use {@link ReaderInputStream#builder()} instead
  319.      */
  320.     @Deprecated
  321.     public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
  322.         this(reader, Charsets.toCharset(charsetName), bufferSize);
  323.     }

  324.     @Override
  325.     public int available() throws IOException {
  326.         if (encoderOut.hasRemaining()) {
  327.             return encoderOut.remaining();
  328.         }
  329.         return 0;
  330.     }

  331.     /**
  332.      * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
  333.      *
  334.      * @throws IOException if an I/O error occurs.
  335.      */
  336.     @Override
  337.     public void close() throws IOException {
  338.         reader.close();
  339.         super.close();
  340.     }

  341.     /**
  342.      * Fills the internal char buffer from the reader.
  343.      *
  344.      * @throws IOException If an I/O error occurs
  345.      */
  346.     private void fillBuffer() throws IOException {
  347.         if (endOfInput) {
  348.             return;
  349.         }
  350.         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
  351.             encoderIn.compact();
  352.             final int position = encoderIn.position();
  353.             // We don't use Reader#read(CharBuffer) here because it is more efficient
  354.             // to write directly to the underlying char array (the default implementation
  355.             // copies data to a temporary char array).
  356.             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
  357.             if (c == EOF) {
  358.                 endOfInput = true;
  359.             } else {
  360.                 encoderIn.position(position + c);
  361.             }
  362.             encoderIn.flip();
  363.         }
  364.         encoderOut.compact();
  365.         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
  366.         if (endOfInput) {
  367.             lastCoderResult = charsetEncoder.flush(encoderOut);
  368.         }
  369.         if (lastCoderResult.isError()) {
  370.             lastCoderResult.throwException();
  371.         }
  372.         encoderOut.flip();
  373.     }

  374.     /**
  375.      * Gets the CharsetEncoder.
  376.      *
  377.      * @return the CharsetEncoder.
  378.      */
  379.     CharsetEncoder getCharsetEncoder() {
  380.         return charsetEncoder;
  381.     }

  382.     /**
  383.      * Reads a single byte.
  384.      *
  385.      * @return either the byte read or {@code -1} if the end of the stream has been reached
  386.      * @throws IOException if an I/O error occurs.
  387.      */
  388.     @Override
  389.     public int read() throws IOException {
  390.         checkOpen();
  391.         for (;;) {
  392.             if (encoderOut.hasRemaining()) {
  393.                 return encoderOut.get() & 0xFF;
  394.             }
  395.             fillBuffer();
  396.             if (endOfInput && !encoderOut.hasRemaining()) {
  397.                 return EOF;
  398.             }
  399.         }
  400.     }

  401.     /**
  402.      * Reads the specified number of bytes into an array.
  403.      *
  404.      * @param b the byte array to read into
  405.      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
  406.      * @throws IOException if an I/O error occurs.
  407.      */
  408.     @Override
  409.     public int read(final byte[] b) throws IOException {
  410.         return read(b, 0, b.length);
  411.     }

  412.     /**
  413.      * Reads the specified number of bytes into an array.
  414.      *
  415.      * @param array the byte array to read into
  416.      * @param off   the offset to start reading bytes into
  417.      * @param len   the number of bytes to read
  418.      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
  419.      * @throws IOException if an I/O error occurs.
  420.      */
  421.     @Override
  422.     public int read(final byte[] array, int off, int len) throws IOException {
  423.         Objects.requireNonNull(array, "array");
  424.         if (len < 0 || off < 0 || off + len > array.length) {
  425.             throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
  426.         }
  427.         int read = 0;
  428.         if (len == 0) {
  429.             return 0; // Always return 0 if len == 0
  430.         }
  431.         while (len > 0) {
  432.             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
  433.                 final int c = Math.min(encoderOut.remaining(), len);
  434.                 encoderOut.get(array, off, c);
  435.                 off += c;
  436.                 len -= c;
  437.                 read += c;
  438.             } else if (endOfInput) { // Already reach EOF in the last read
  439.                 break;
  440.             } else { // Read again
  441.                 fillBuffer();
  442.             }
  443.         }
  444.         return read == 0 && endOfInput ? EOF : read;
  445.     }
  446. }