001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.OutputStreamWriter;
026import java.io.Reader;
027import java.nio.ByteBuffer;
028import java.nio.CharBuffer;
029import java.nio.charset.Charset;
030import java.nio.charset.CharsetEncoder;
031import java.nio.charset.CoderResult;
032import java.nio.charset.CodingErrorAction;
033import java.util.Objects;
034
035import org.apache.commons.io.Charsets;
036import org.apache.commons.io.IOUtils;
037import org.apache.commons.io.build.AbstractStreamBuilder;
038import org.apache.commons.io.charset.CharsetEncoders;
039
040/**
041 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
042 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
043 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
044 * <p>
045 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
046 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
047 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
048 * {@link BufferedReader}.
049 * </p>
050 * <p>
051 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
052 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
053 * </p>
054 * <p>
055 * To build an instance, use {@link Builder}.
056 * </p>
057 * <pre>
058 * InputStream inputStream = ...
059 * Charset cs = ...
060 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
061 * ReaderInputStream in2 = ReaderInputStream.builder()
062 *   .setReader(reader)
063 *   .setCharset(cs)
064 *   .get();
065 * </pre>
066 * <p>
067 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
068 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
069 * pulls it from the underlying stream.
070 * </p>
071 * <p>
072 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
073 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
074 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
075 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
076 * </p>
077 * <p>
078 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
079 * </p>
080 * <p>
081 * Instances of {@link ReaderInputStream} are not thread safe.
082 * </p>
083 *
084 * @see Builder
085 * @see org.apache.commons.io.output.WriterOutputStream
086 * @since 2.0
087 */
088public class ReaderInputStream extends AbstractInputStream {
089
090    // @formatter:off
091    /**
092     * Builds a new {@link ReaderInputStream}.
093     *
094     * <p>
095     * For example:
096     * </p>
097     * <pre>{@code
098     * ReaderInputStream s = ReaderInputStream.builder()
099     *   .setPath(path)
100     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
101     *   .get();}
102     * </pre>
103     *
104     * @see #get()
105     * @since 2.12.0
106     */
107    // @formatter:on
108    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
109
110        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
111
112        /**
113         * Constructs a new builder of {@link ReaderInputStream}.
114         */
115        public Builder() {
116            // empty
117        }
118
119        /**
120         * Builds a new {@link ReaderInputStream}.
121         *
122         * <p>
123         * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
124         * </p>
125         * <p>
126         * This builder uses the following aspects:
127         * </p>
128         * <ul>
129         * <li>{@link #getReader()} gets the target aspect.</li>
130         * <li>{@link #getBufferSize()}</li>
131         * <li>{@link #getCharset()}</li>
132         * <li>{@link CharsetEncoder}</li>
133         * </ul>
134         *
135         * @return a new instance.
136         * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
137         * @throws IllegalStateException         if the {@code origin} is {@code null}.
138         * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
139         * @see #getReader()
140         * @see CharsetEncoder
141         * @see #getBufferSize()
142         * @see #getUnchecked()
143         */
144        @Override
145        public ReaderInputStream get() throws IOException {
146            return new ReaderInputStream(this);
147        }
148
149        CharsetEncoder getCharsetEncoder() {
150            return charsetEncoder;
151        }
152
153        @Override
154        public Builder setCharset(final Charset charset) {
155            super.setCharset(charset);
156            charsetEncoder = newEncoder(getCharset());
157            return this;
158        }
159
160        /**
161         * Sets the charset encoder. Assumes that the caller has configured the encoder.
162         *
163         * @param newEncoder the charset encoder, null resets to a default encoder.
164         * @return {@code this} instance.
165         */
166        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
167            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
168            super.setCharset(charsetEncoder.charset());
169            return this;
170        }
171
172    }
173
174    /**
175     * Constructs a new {@link Builder}.
176     *
177     * @return a new {@link Builder}.
178     * @since 2.12.0
179     */
180    public static Builder builder() {
181        return new Builder();
182    }
183
184    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
185        final float minRequired = minBufferSize(charsetEncoder);
186        if (bufferSize < minRequired) {
187            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
188                    charsetEncoder.charset().displayName()));
189        }
190        return bufferSize;
191    }
192
193    static float minBufferSize(final CharsetEncoder charsetEncoder) {
194        return charsetEncoder.maxBytesPerChar() * 2;
195    }
196
197    private static CharsetEncoder newEncoder(final Charset charset) {
198        // @formatter:off
199        return Charsets.toCharset(charset).newEncoder()
200                .onMalformedInput(CodingErrorAction.REPLACE)
201                .onUnmappableCharacter(CodingErrorAction.REPLACE);
202        // @formatter:on
203    }
204
205    private final Reader reader;
206
207    private final CharsetEncoder charsetEncoder;
208
209    /**
210     * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
211     */
212    private final CharBuffer encoderIn;
213    /**
214     * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
215     * caller.
216     */
217    private final ByteBuffer encoderOut;
218
219    private CoderResult lastCoderResult;
220
221    private boolean endOfInput;
222
223    @SuppressWarnings("resource") // caller closes.
224    private ReaderInputStream(final Builder builder) throws IOException {
225        this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
226    }
227
228    /**
229     * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@link Charset#defaultCharset() default charset} with a default input buffer
230     * size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
231     *
232     * @param reader the target {@link Reader}
233     * @deprecated Use {@link ReaderInputStream#builder()} instead
234     */
235    @Deprecated
236    public ReaderInputStream(final Reader reader) {
237        this(reader, Charset.defaultCharset());
238    }
239
240    /**
241     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
242     *
243     * <p>
244     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
245     * </p>
246     *
247     * @param reader  the target {@link Reader}
248     * @param charset the charset encoding
249     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
250     */
251    @Deprecated
252    public ReaderInputStream(final Reader reader, final Charset charset) {
253        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
254    }
255
256    /**
257     * Constructs a new {@link ReaderInputStream}.
258     *
259     * <p>
260     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
261     * </p>
262     *
263     * @param reader     the target {@link Reader}.
264     * @param charset    the charset encoding.
265     * @param bufferSize the size of the input buffer in number of characters.
266     * @deprecated Use {@link ReaderInputStream#builder()} instead
267     */
268    @Deprecated
269    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
270        // @formatter:off
271        this(reader,
272            Charsets.toCharset(charset).newEncoder()
273                    .onMalformedInput(CodingErrorAction.REPLACE)
274                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
275             bufferSize);
276        // @formatter:on
277    }
278
279    /**
280     * Constructs a new {@link ReaderInputStream}.
281     *
282     * <p>
283     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
284     * an encoder which had already been in use.
285     * </p>
286     *
287     * @param reader         the target {@link Reader}
288     * @param charsetEncoder the charset encoder
289     * @since 2.1
290     * @deprecated Use {@link ReaderInputStream#builder()} instead
291     */
292    @Deprecated
293    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
294        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
295    }
296
297    /**
298     * Constructs a new {@link ReaderInputStream}.
299     *
300     * <p>
301     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
302     * an encoder which had already been in use.
303     * </p>
304     *
305     * @param reader         the target {@link Reader}
306     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
307     * @param bufferSize     the size of the input buffer in number of characters
308     * @since 2.1
309     * @deprecated Use {@link ReaderInputStream#builder()} instead
310     */
311    @Deprecated
312    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
313        this.reader = reader;
314        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
315        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
316        this.encoderIn.flip();
317        this.encoderOut = ByteBuffer.allocate(128);
318        this.encoderOut.flip();
319    }
320
321    /**
322     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
323     *
324     * <p>
325     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
326     * </p>
327     *
328     * @param reader      the target {@link Reader}
329     * @param charsetName the name of the charset encoding
330     * @deprecated Use {@link ReaderInputStream#builder()} instead
331     */
332    @Deprecated
333    public ReaderInputStream(final Reader reader, final String charsetName) {
334        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
335    }
336
337    /**
338     * Constructs a new {@link ReaderInputStream}.
339     *
340     * <p>
341     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
342     * </p>
343     *
344     * @param reader      the target {@link Reader}
345     * @param charsetName the name of the charset encoding, null maps to the default Charset.
346     * @param bufferSize  the size of the input buffer in number of characters
347     * @deprecated Use {@link ReaderInputStream#builder()} instead
348     */
349    @Deprecated
350    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
351        this(reader, Charsets.toCharset(charsetName), bufferSize);
352    }
353
354    @Override
355    public int available() throws IOException {
356        if (encoderOut.hasRemaining()) {
357            return encoderOut.remaining();
358        }
359        return 0;
360    }
361
362    /**
363     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
364     *
365     * @throws IOException if an I/O error occurs.
366     */
367    @Override
368    public void close() throws IOException {
369        reader.close();
370        super.close();
371    }
372
373    /**
374     * Fills the internal char buffer from the reader.
375     *
376     * @throws IOException If an I/O error occurs
377     */
378    private void fillBuffer() throws IOException {
379        if (endOfInput) {
380            return;
381        }
382        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
383            encoderIn.compact();
384            final int position = encoderIn.position();
385            // We don't use Reader#read(CharBuffer) here because it is more efficient
386            // to write directly to the underlying char array (the default implementation
387            // copies data to a temporary char array).
388            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
389            if (c == EOF) {
390                endOfInput = true;
391            } else {
392                encoderIn.position(position + c);
393            }
394            encoderIn.flip();
395        }
396        encoderOut.compact();
397        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
398        if (endOfInput) {
399            lastCoderResult = charsetEncoder.flush(encoderOut);
400        }
401        if (lastCoderResult.isError()) {
402            lastCoderResult.throwException();
403        }
404        encoderOut.flip();
405    }
406
407    /**
408     * Gets the CharsetEncoder.
409     *
410     * @return the CharsetEncoder.
411     */
412    CharsetEncoder getCharsetEncoder() {
413        return charsetEncoder;
414    }
415
416    /**
417     * Reads a single byte.
418     *
419     * @return either the byte read or {@code -1} if the end of the stream has been reached
420     * @throws IOException if an I/O error occurs.
421     */
422    @Override
423    public int read() throws IOException {
424        checkOpen();
425        for (;;) {
426            if (encoderOut.hasRemaining()) {
427                return encoderOut.get() & 0xFF;
428            }
429            fillBuffer();
430            if (endOfInput && !encoderOut.hasRemaining()) {
431                return EOF;
432            }
433        }
434    }
435
436    /**
437     * Reads the specified number of bytes into an array.
438     *
439     * @param b the byte array to read into
440     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
441     * @throws IOException if an I/O error occurs.
442     */
443    @Override
444    public int read(final byte[] b) throws IOException {
445        return read(b, 0, b.length);
446    }
447
448    /**
449     * Reads the specified number of bytes into an array.
450     *
451     * @param array the byte array to read into
452     * @param off   the offset to start reading bytes into
453     * @param len   the number of bytes to read
454     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
455     * @throws IOException if an I/O error occurs.
456     */
457    @Override
458    public int read(final byte[] array, int off, int len) throws IOException {
459        Objects.requireNonNull(array, "array");
460        if (len < 0 || off < 0 || off + len > array.length) {
461            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
462        }
463        int read = 0;
464        if (len == 0) {
465            return 0; // Always return 0 if len == 0
466        }
467        while (len > 0) {
468            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
469                final int c = Math.min(encoderOut.remaining(), len);
470                encoderOut.get(array, off, c);
471                off += c;
472                len -= c;
473                read += c;
474            } else if (endOfInput) { // Already reach EOF in the last read
475                break;
476            } else { // Read again
477                fillBuffer();
478            }
479        }
480        return read == 0 && endOfInput ? EOF : read;
481    }
482}