001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.OutputStreamWriter;
026import java.io.Reader;
027import java.nio.ByteBuffer;
028import java.nio.CharBuffer;
029import java.nio.charset.Charset;
030import java.nio.charset.CharsetEncoder;
031import java.nio.charset.CoderResult;
032import java.nio.charset.CodingErrorAction;
033
034import org.apache.commons.io.Charsets;
035import org.apache.commons.io.IOUtils;
036import org.apache.commons.io.build.AbstractStreamBuilder;
037import org.apache.commons.io.charset.CharsetEncoders;
038
039/**
040 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
041 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
042 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
043 * <p>
044 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
045 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
046 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
047 * {@link BufferedReader}.
048 * </p>
049 * <p>
050 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
051 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
052 * </p>
053 * <p>
054 * To build an instance, use {@link Builder}.
055 * </p>
056 * <pre>
057 * InputStream inputStream = ...
058 * Charset cs = ...
059 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
060 * ReaderInputStream in2 = ReaderInputStream.builder()
061 *   .setReader(reader)
062 *   .setCharset(cs)
063 *   .get();
064 * </pre>
065 * <p>
066 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
067 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
068 * pulls it from the underlying stream.
069 * </p>
070 * <p>
071 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
072 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
073 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
074 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
075 * </p>
076 * <p>
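077 * The {@link #available()} method of this class returns the number of already encoded bytes held in the internal output buffer (possibly 0); it never reads from the underlying {@link Reader}. The methods {@link #mark(int)} and {@link #reset()} are not supported.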
078 * </p>
079 * <p>
080 * Instances of {@link ReaderInputStream} are not thread safe.
081 * </p>
082 *
083 * @see Builder
084 * @see org.apache.commons.io.output.WriterOutputStream
085 * @since 2.0
086 */
087public class ReaderInputStream extends AbstractInputStream {
088
089    // @formatter:off
090    /**
091     * Builds a new {@link ReaderInputStream}.
092     *
093     * <p>
094     * For example:
095     * </p>
096     * <pre>{@code
097     * ReaderInputStream s = ReaderInputStream.builder()
098     *   .setPath(path)
099     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
100     *   .get();}
101     * </pre>
102     *
103     * @see #get()
104     * @since 2.12.0
105     */
106    // @formatter:on
107    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
108
109        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
110
111        /**
112         * Constructs a new builder of {@link ReaderInputStream}.
113         */
114        public Builder() {
115            // empty
116        }
117
118        /**
119         * Builds a new {@link ReaderInputStream}.
120         *
121         * <p>
122         * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
123         * </p>
124         * <p>
125         * This builder uses the following aspects:
126         * </p>
127         * <ul>
128         * <li>{@link #getReader()} gets the target aspect.</li>
129         * <li>{@link #getBufferSize()}</li>
130         * <li>{@link #getCharset()}</li>
131         * <li>{@link CharsetEncoder}</li>
132         * </ul>
133         *
134         * @return a new instance.
135         * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
136         * @throws IllegalStateException         if the {@code origin} is {@code null}.
137         * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
138         * @see #getReader()
139         * @see CharsetEncoder
140         * @see #getBufferSize()
141         * @see #getUnchecked()
142         */
143        @Override
144        public ReaderInputStream get() throws IOException {
145            return new ReaderInputStream(this);
146        }
147
148        CharsetEncoder getCharsetEncoder() {
149            return charsetEncoder;
150        }
151
152        @Override
153        public Builder setCharset(final Charset charset) {
154            super.setCharset(charset);
155            charsetEncoder = newEncoder(getCharset());
156            return this;
157        }
158
159        /**
160         * Sets the charset encoder. Assumes that the caller has configured the encoder.
161         *
162         * @param newEncoder the charset encoder, null resets to a default encoder.
163         * @return {@code this} instance.
164         */
165        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
166            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
167            super.setCharset(charsetEncoder.charset());
168            return this;
169        }
170
171    }
172
173    /**
174     * Constructs a new {@link Builder}.
175     *
176     * @return a new {@link Builder}.
177     * @since 2.12.0
178     */
179    public static Builder builder() {
180        return new Builder();
181    }
182
183    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
184        final float minRequired = minBufferSize(charsetEncoder);
185        if (bufferSize < minRequired) {
186            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
187                    charsetEncoder.charset().displayName()));
188        }
189        return bufferSize;
190    }
191
192    static float minBufferSize(final CharsetEncoder charsetEncoder) {
193        return charsetEncoder.maxBytesPerChar() * 2;
194    }
195
196    private static CharsetEncoder newEncoder(final Charset charset) {
197        // @formatter:off
198        return Charsets.toCharset(charset).newEncoder()
199                .onMalformedInput(CodingErrorAction.REPLACE)
200                .onUnmappableCharacter(CodingErrorAction.REPLACE);
201        // @formatter:on
202    }
203
204    private final Reader reader;
205
206    private final CharsetEncoder charsetEncoder;
207
208    /**
209     * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
210     */
211    private final CharBuffer encoderIn;
212    /**
213     * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
214     * caller.
215     */
216    private final ByteBuffer encoderOut;
217
218    private CoderResult lastCoderResult;
219
220    private boolean endOfInput;
221
222    @SuppressWarnings("resource") // caller closes.
223    private ReaderInputStream(final Builder builder) throws IOException {
224        this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
225    }
226
227    /**
228     * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@linkplain Charset#defaultCharset() default charset} with a default input
229     * buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
230     *
231     * @param reader the target {@link Reader}
232     * @deprecated Use {@link ReaderInputStream#builder()} instead
233     */
234    @Deprecated
235    public ReaderInputStream(final Reader reader) {
236        this(reader, Charset.defaultCharset());
237    }
238
239    /**
240     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
241     *
242     * <p>
243     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
244     * </p>
245     *
246     * @param reader  the target {@link Reader}
247     * @param charset the charset encoding
248     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
249     */
250    @Deprecated
251    public ReaderInputStream(final Reader reader, final Charset charset) {
252        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
253    }
254
255    /**
256     * Constructs a new {@link ReaderInputStream}.
257     *
258     * <p>
259     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
260     * </p>
261     *
262     * @param reader     the target {@link Reader}.
263     * @param charset    the charset encoding.
264     * @param bufferSize the size of the input buffer in number of characters.
265     * @deprecated Use {@link ReaderInputStream#builder()} instead
266     */
267    @Deprecated
268    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
269        // @formatter:off
270        this(reader,
271            Charsets.toCharset(charset).newEncoder()
272                    .onMalformedInput(CodingErrorAction.REPLACE)
273                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
274             bufferSize);
275        // @formatter:on
276    }
277
278    /**
279     * Constructs a new {@link ReaderInputStream}.
280     *
281     * <p>
282     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
283     * an encoder which had already been in use.
284     * </p>
285     *
286     * @param reader         the target {@link Reader}
287     * @param charsetEncoder the charset encoder
288     * @since 2.1
289     * @deprecated Use {@link ReaderInputStream#builder()} instead
290     */
291    @Deprecated
292    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
293        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
294    }
295
296    /**
297     * Constructs a new {@link ReaderInputStream}.
298     *
299     * <p>
300     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
301     * an encoder which had already been in use.
302     * </p>
303     *
304     * @param reader         the target {@link Reader}
305     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
306     * @param bufferSize     the size of the input buffer in number of characters
307     * @since 2.1
308     * @deprecated Use {@link ReaderInputStream#builder()} instead
309     */
310    @Deprecated
311    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
312        this.reader = reader;
313        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
314        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
315        this.encoderIn.flip();
316        this.encoderOut = ByteBuffer.allocate(128);
317        this.encoderOut.flip();
318    }
319
320    /**
321     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
322     *
323     * <p>
324     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
325     * </p>
326     *
327     * @param reader      the target {@link Reader}
328     * @param charsetName the name of the charset encoding
329     * @deprecated Use {@link ReaderInputStream#builder()} instead
330     */
331    @Deprecated
332    public ReaderInputStream(final Reader reader, final String charsetName) {
333        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
334    }
335
336    /**
337     * Constructs a new {@link ReaderInputStream}.
338     *
339     * <p>
340     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
341     * </p>
342     *
343     * @param reader      the target {@link Reader}
344     * @param charsetName the name of the charset encoding, null maps to the default Charset.
345     * @param bufferSize  the size of the input buffer in number of characters
346     * @deprecated Use {@link ReaderInputStream#builder()} instead
347     */
348    @Deprecated
349    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
350        this(reader, Charsets.toCharset(charsetName), bufferSize);
351    }
352
353    @Override
354    public int available() throws IOException {
355        if (encoderOut.hasRemaining()) {
356            return encoderOut.remaining();
357        }
358        return 0;
359    }
360
361    /**
362     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
363     *
364     * @throws IOException if an I/O error occurs.
365     */
366    @Override
367    public void close() throws IOException {
368        reader.close();
369        super.close();
370    }
371
372    /**
373     * Fills the internal char buffer from the reader.
374     *
375     * @throws IOException If an I/O error occurs
376     */
377    private void fillBuffer() throws IOException {
378        if (endOfInput) {
379            return;
380        }
381        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
382            encoderIn.compact();
383            final int position = encoderIn.position();
384            // We don't use Reader#read(CharBuffer) here because it is more efficient
385            // to write directly to the underlying char array (the default implementation
386            // copies data to a temporary char array).
387            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
388            if (c == EOF) {
389                endOfInput = true;
390            } else {
391                encoderIn.position(position + c);
392            }
393            encoderIn.flip();
394        }
395        encoderOut.compact();
396        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
397        if (endOfInput) {
398            lastCoderResult = charsetEncoder.flush(encoderOut);
399        }
400        if (lastCoderResult.isError()) {
401            lastCoderResult.throwException();
402        }
403        encoderOut.flip();
404    }
405
406    /**
407     * Gets the CharsetEncoder.
408     *
409     * @return the CharsetEncoder.
410     */
411    CharsetEncoder getCharsetEncoder() {
412        return charsetEncoder;
413    }
414
415    /**
416     * Reads a single byte.
417     *
418     * @return either the byte read or {@code -1} if the end of the stream has been reached
419     * @throws IOException if an I/O error occurs.
420     */
421    @Override
422    public int read() throws IOException {
423        checkOpen();
424        for (;;) {
425            if (encoderOut.hasRemaining()) {
426                return encoderOut.get() & 0xFF;
427            }
428            fillBuffer();
429            if (endOfInput && !encoderOut.hasRemaining()) {
430                return EOF;
431            }
432        }
433    }
434
435    /**
436     * Reads the specified number of bytes into an array.
437     *
438     * @param b the byte array to read into, must not be {@code null}
439     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
440     * @throws NullPointerException if the byte array is {@code null}.
441     * @throws IOException if an I/O error occurs.
442     */
443    @Override
444    public int read(final byte[] b) throws IOException {
445        return read(b, 0, b.length);
446    }
447
448    /**
449     * Reads the specified number of bytes into an array.
450     *
451     * @param array the byte array to read into
452     * @param off   the offset to start reading bytes into
453     * @param len   the number of bytes to read
454     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
455     * @throws NullPointerException      if the byte array is {@code null}.
456     * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code array.length}.
457     * @throws IOException if an I/O error occurs.
458     */
459    @Override
460    public int read(final byte[] array, int off, int len) throws IOException {
461        IOUtils.checkFromIndexSize(array, off, len);
462        if (len == 0) {
463            return 0; // Always return 0 if len == 0
464        }
465        int read = 0;
466        while (len > 0) {
467            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
468                final int c = Math.min(encoderOut.remaining(), len);
469                encoderOut.get(array, off, c);
470                off += c;
471                len -= c;
472                read += c;
473            } else if (endOfInput) { // Already reach EOF in the last read
474                break;
475            } else { // Read again
476                fillBuffer();
477            }
478        }
479        return read == 0 && endOfInput ? EOF : read;
480    }
481}