001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.Reader;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.Charset;
027import java.nio.charset.CharsetEncoder;
028import java.nio.charset.CoderResult;
029import java.nio.charset.CodingErrorAction;
030import java.util.Objects;
031
032import org.apache.commons.io.Charsets;
033import org.apache.commons.io.IOUtils;
034import org.apache.commons.io.build.AbstractStreamBuilder;
035import org.apache.commons.io.charset.CharsetEncoders;
036
037/**
038 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
039 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
040 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
041 * <p>
042 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
043 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
044 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
045 * {@link java.io.BufferedReader}.
046 * </p>
047 * <p>
048 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2}
049 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
050 * </p>
051 * <p>
052 * To build an instance, use {@link Builder}.
053 * </p>
054 * <pre>
055 * InputStream inputStream = ...
056 * Charset cs = ...
057 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
058 * ReaderInputStream in2 = ReaderInputStream.builder()
059 *   .setReader(reader)
060 *   .setCharset(cs)
061 *   .get();
062 * </pre>
063 * <p>
064 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
065 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
066 * pulls it from the underlying stream.
067 * </p>
068 * <p>
069 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
070 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
071 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
072 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
073 * </p>
074 * <p>
075 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
076 * </p>
077 * <p>
078 * Instances of {@link ReaderInputStream} are not thread safe.
079 * </p>
080 *
081 * @see Builder
082 * @see org.apache.commons.io.output.WriterOutputStream
083 * @since 2.0
084 */
085public class ReaderInputStream extends InputStream {
086
087    // @formatter:off
088    /**
089     * Builds a new {@link ReaderInputStream}.
090     *
091     * <p>
092     * For example:
093     * </p>
094     * <pre>{@code
095     * ReaderInputStream s = ReaderInputStream.builder()
096     *   .setPath(path)
097     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
098     *   .get();}
099     * </pre>
100     *
101     * @see #get()
102     * @since 2.12.0
103     */
104    // @formatter:on
105    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
106
107        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
108
109        /**
110         * Builds a new {@link ReaderInputStream}.
111         *
112         * <p>
113         * You must set input that supports {@link #getReader()}, otherwise, this method throws an exception.
114         * </p>
115         * <p>
116         * This builder use the following aspects:
117         * </p>
118         * <ul>
119         * <li>{@link #getReader()}</li>
120         * <li>{@link #getBufferSize()}</li>
121         * <li>{@link #getCharset()}</li>
122         * <li>{@link CharsetEncoder}</li>
123         * </ul>
124         *
125         * @return a new instance.
126         * @throws UnsupportedOperationException if the origin cannot provide a Reader.
127         * @throws IllegalStateException if the {@code origin} is {@code null}.
128         * @see #getReader()
129         * @see CharsetEncoder
130         * @see #getBufferSize()
131         */
132        @SuppressWarnings("resource")
133        @Override
134        public ReaderInputStream get() throws IOException {
135            return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize());
136        }
137
138        CharsetEncoder getCharsetEncoder() {
139            return charsetEncoder;
140        }
141
142        @Override
143        public Builder setCharset(final Charset charset) {
144            super.setCharset(charset);
145            charsetEncoder = newEncoder(getCharset());
146            return this;
147        }
148
149        /**
150         * Sets the charset encoder. Assumes that the caller has configured the encoder.
151         *
152         * @param newEncoder the charset encoder, null resets to a default encoder.
153         * @return this
154         */
155        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
156            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
157            super.setCharset(charsetEncoder.charset());
158            return this;
159        }
160
161    }
162
163    /**
164     * Constructs a new {@link Builder}.
165     *
166     * @return a new {@link Builder}.
167     * @since 2.12.0
168     */
169    public static Builder builder() {
170        return new Builder();
171    }
172
173    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
174        final float minRequired = minBufferSize(charsetEncoder);
175        if (bufferSize < minRequired) {
176            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
177                    charsetEncoder.charset().displayName()));
178        }
179        return bufferSize;
180    }
181
182    static float minBufferSize(final CharsetEncoder charsetEncoder) {
183        return charsetEncoder.maxBytesPerChar() * 2;
184    }
185
186    private static CharsetEncoder newEncoder(final Charset charset) {
187        // @formatter:off
188        return Charsets.toCharset(charset).newEncoder()
189                .onMalformedInput(CodingErrorAction.REPLACE)
190                .onUnmappableCharacter(CodingErrorAction.REPLACE);
191        // @formatter:on
192    }
193
194    private final Reader reader;
195
196    private final CharsetEncoder charsetEncoder;
197
198    /**
199     * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
200     */
201    private final CharBuffer encoderIn;
202    /**
203     * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
204     * caller.
205     */
206    private final ByteBuffer encoderOut;
207
208    private CoderResult lastCoderResult;
209
210    private boolean endOfInput;
211
212    /**
213     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
214     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
215     *
216     * @param reader the target {@link Reader}
217     * @deprecated Use {@link ReaderInputStream#builder()} instead
218     */
219    @Deprecated
220    public ReaderInputStream(final Reader reader) {
221        this(reader, Charset.defaultCharset());
222    }
223
224    /**
225     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
226     *
227     * <p>
228     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
229     * </p>
230     *
231     * @param reader  the target {@link Reader}
232     * @param charset the charset encoding
233     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
234     */
235    @Deprecated
236    public ReaderInputStream(final Reader reader, final Charset charset) {
237        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
238    }
239
240    /**
241     * Constructs a new {@link ReaderInputStream}.
242     *
243     * <p>
244     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
245     * </p>
246     *
247     * @param reader     the target {@link Reader}.
248     * @param charset    the charset encoding.
249     * @param bufferSize the size of the input buffer in number of characters.
250     * @deprecated Use {@link ReaderInputStream#builder()} instead
251     */
252    @Deprecated
253    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
254        // @formatter:off
255        this(reader,
256            Charsets.toCharset(charset).newEncoder()
257                    .onMalformedInput(CodingErrorAction.REPLACE)
258                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
259             bufferSize);
260        // @formatter:on
261    }
262
263    /**
264     * Constructs a new {@link ReaderInputStream}.
265     *
266     * <p>
267     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
268     * an encoder which had already been in use.
269     * </p>
270     *
271     * @param reader         the target {@link Reader}
272     * @param charsetEncoder the charset encoder
273     * @since 2.1
274     * @deprecated Use {@link ReaderInputStream#builder()} instead
275     */
276    @Deprecated
277    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
278        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
279    }
280
281    /**
282     * Constructs a new {@link ReaderInputStream}.
283     *
284     * <p>
285     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
286     * an encoder which had already been in use.
287     * </p>
288     *
289     * @param reader         the target {@link Reader}
290     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
291     * @param bufferSize     the size of the input buffer in number of characters
292     * @since 2.1
293     * @deprecated Use {@link ReaderInputStream#builder()} instead
294     */
295    @Deprecated
296    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
297        this.reader = reader;
298        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
299        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
300        this.encoderIn.flip();
301        this.encoderOut = ByteBuffer.allocate(128);
302        this.encoderOut.flip();
303    }
304
305    /**
306     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
307     *
308     * <p>
309     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
310     * </p>
311     *
312     * @param reader      the target {@link Reader}
313     * @param charsetName the name of the charset encoding
314     * @deprecated Use {@link ReaderInputStream#builder()} instead
315     */
316    @Deprecated
317    public ReaderInputStream(final Reader reader, final String charsetName) {
318        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
319    }
320
321    /**
322     * Constructs a new {@link ReaderInputStream}.
323     *
324     * <p>
325     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
326     * </p>
327     *
328     * @param reader      the target {@link Reader}
329     * @param charsetName the name of the charset encoding, null maps to the default Charset.
330     * @param bufferSize  the size of the input buffer in number of characters
331     * @deprecated Use {@link ReaderInputStream#builder()} instead
332     */
333    @Deprecated
334    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
335        this(reader, Charsets.toCharset(charsetName), bufferSize);
336    }
337
338    /**
339     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
340     *
341     * @throws IOException if an I/O error occurs.
342     */
343    @Override
344    public void close() throws IOException {
345        reader.close();
346    }
347
348    /**
349     * Fills the internal char buffer from the reader.
350     *
351     * @throws IOException If an I/O error occurs
352     */
353    private void fillBuffer() throws IOException {
354        if (endOfInput) {
355            return;
356        }
357        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
358            encoderIn.compact();
359            final int position = encoderIn.position();
360            // We don't use Reader#read(CharBuffer) here because it is more efficient
361            // to write directly to the underlying char array (the default implementation
362            // copies data to a temporary char array).
363            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
364            if (c == EOF) {
365                endOfInput = true;
366            } else {
367                encoderIn.position(position + c);
368            }
369            encoderIn.flip();
370        }
371        encoderOut.compact();
372        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
373        if (endOfInput) {
374            lastCoderResult = charsetEncoder.flush(encoderOut);
375        }
376        if (lastCoderResult.isError()) {
377            lastCoderResult.throwException();
378        }
379        encoderOut.flip();
380    }
381
382    /**
383     * Gets the CharsetEncoder.
384     *
385     * @return the CharsetEncoder.
386     */
387    CharsetEncoder getCharsetEncoder() {
388        return charsetEncoder;
389    }
390
391    /**
392     * Reads a single byte.
393     *
394     * @return either the byte read or {@code -1} if the end of the stream has been reached
395     * @throws IOException if an I/O error occurs.
396     */
397    @Override
398    public int read() throws IOException {
399        for (;;) {
400            if (encoderOut.hasRemaining()) {
401                return encoderOut.get() & 0xFF;
402            }
403            fillBuffer();
404            if (endOfInput && !encoderOut.hasRemaining()) {
405                return EOF;
406            }
407        }
408    }
409
410    /**
411     * Reads the specified number of bytes into an array.
412     *
413     * @param b the byte array to read into
414     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
415     * @throws IOException if an I/O error occurs.
416     */
417    @Override
418    public int read(final byte[] b) throws IOException {
419        return read(b, 0, b.length);
420    }
421
422    /**
423     * Reads the specified number of bytes into an array.
424     *
425     * @param array the byte array to read into
426     * @param off   the offset to start reading bytes into
427     * @param len   the number of bytes to read
428     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
429     * @throws IOException if an I/O error occurs.
430     */
431    @Override
432    public int read(final byte[] array, int off, int len) throws IOException {
433        Objects.requireNonNull(array, "array");
434        if (len < 0 || off < 0 || off + len > array.length) {
435            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
436        }
437        int read = 0;
438        if (len == 0) {
439            return 0; // Always return 0 if len == 0
440        }
441        while (len > 0) {
442            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
443                final int c = Math.min(encoderOut.remaining(), len);
444                encoderOut.get(array, off, c);
445                off += c;
446                len -= c;
447                read += c;
448            } else if (endOfInput) { // Already reach EOF in the last read
449                break;
450            } else { // Read again
451                fillBuffer();
452            }
453        }
454        return read == 0 && endOfInput ? EOF : read;
455    }
456}