View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.BufferedReader;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.OutputStreamWriter;
26  import java.io.Reader;
27  import java.nio.ByteBuffer;
28  import java.nio.CharBuffer;
29  import java.nio.charset.Charset;
30  import java.nio.charset.CharsetEncoder;
31  import java.nio.charset.CoderResult;
32  import java.nio.charset.CodingErrorAction;
33  import java.util.Objects;
34  
35  import org.apache.commons.io.Charsets;
36  import org.apache.commons.io.IOUtils;
37  import org.apache.commons.io.build.AbstractStreamBuilder;
38  import org.apache.commons.io.charset.CharsetEncoders;
39  
40  /**
41   * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
42   * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
43   * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
44   * <p>
45   * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
46   * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
47   * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
48   * {@link BufferedReader}.
49   * </p>
50   * <p>
51   * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
52   * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
53   * </p>
54   * <p>
55   * To build an instance, use {@link Builder}.
56   * </p>
57   * <pre>
58   * InputStream inputStream = ...
59   * Charset cs = ...
60   * InputStreamReader reader = new InputStreamReader(inputStream, cs);
61   * ReaderInputStream in2 = ReaderInputStream.builder()
62   *   .setReader(reader)
63   *   .setCharset(cs)
64   *   .get();
65   * </pre>
66   * <p>
67   * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
68   * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
69   * pulls it from the underlying stream.
70   * </p>
71   * <p>
72   * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
73   * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
74   * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
75   * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
76   * </p>
77   * <p>
78   * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
79   * </p>
80   * <p>
81   * Instances of {@link ReaderInputStream} are not thread safe.
82   * </p>
83   *
84   * @see Builder
85   * @see org.apache.commons.io.output.WriterOutputStream
86   * @since 2.0
87   */
88  public class ReaderInputStream extends AbstractInputStream {
89  
90      // @formatter:off
91      /**
92       * Builds a new {@link ReaderInputStream}.
93       *
94       * <p>
95       * For example:
96       * </p>
97       * <pre>{@code
98       * ReaderInputStream s = ReaderInputStream.builder()
99       *   .setPath(path)
100      *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
101      *   .get();}
102      * </pre>
103      *
104      * @see #get()
105      * @since 2.12.0
106      */
107     // @formatter:on
108     public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
109 
110         private CharsetEncoder charsetEncoder = newEncoder(getCharset());
111 
112         /**
113          * Constructs a new builder of {@link ReaderInputStream}.
114          */
115         public Builder() {
116             // empty
117         }
118 
119         /**
120          * Builds a new {@link ReaderInputStream}.
121          *
122          * <p>
123          * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
124          * </p>
125          * <p>
126          * This builder uses the following aspects:
127          * </p>
128          * <ul>
129          * <li>{@link #getReader()} gets the target aspect.</li>
130          * <li>{@link #getBufferSize()}</li>
131          * <li>{@link #getCharset()}</li>
132          * <li>{@link CharsetEncoder}</li>
133          * </ul>
134          *
135          * @return a new instance.
136          * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
137          * @throws IllegalStateException         if the {@code origin} is {@code null}.
138          * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
139          * @see #getReader()
140          * @see CharsetEncoder
141          * @see #getBufferSize()
142          * @see #getUnchecked()
143          */
144         @Override
145         public ReaderInputStream get() throws IOException {
146             return new ReaderInputStream(this);
147         }
148 
149         CharsetEncoder getCharsetEncoder() {
150             return charsetEncoder;
151         }
152 
153         @Override
154         public Builder setCharset(final Charset charset) {
155             super.setCharset(charset);
156             charsetEncoder = newEncoder(getCharset());
157             return this;
158         }
159 
160         /**
161          * Sets the charset encoder. Assumes that the caller has configured the encoder.
162          *
163          * @param newEncoder the charset encoder, null resets to a default encoder.
164          * @return {@code this} instance.
165          */
166         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
167             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
168             super.setCharset(charsetEncoder.charset());
169             return this;
170         }
171 
172     }
173 
174     /**
175      * Constructs a new {@link Builder}.
176      *
177      * @return a new {@link Builder}.
178      * @since 2.12.0
179      */
180     public static Builder builder() {
181         return new Builder();
182     }
183 
184     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
185         final float minRequired = minBufferSize(charsetEncoder);
186         if (bufferSize < minRequired) {
187             throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
188                     charsetEncoder.charset().displayName()));
189         }
190         return bufferSize;
191     }
192 
193     static float minBufferSize(final CharsetEncoder charsetEncoder) {
194         return charsetEncoder.maxBytesPerChar() * 2;
195     }
196 
197     private static CharsetEncoder newEncoder(final Charset charset) {
198         // @formatter:off
199         return Charsets.toCharset(charset).newEncoder()
200                 .onMalformedInput(CodingErrorAction.REPLACE)
201                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
202         // @formatter:on
203     }
204 
205     private final Reader reader;
206 
207     private final CharsetEncoder charsetEncoder;
208 
209     /**
210      * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
211      */
212     private final CharBuffer encoderIn;
213     /**
214      * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
215      * caller.
216      */
217     private final ByteBuffer encoderOut;
218 
219     private CoderResult lastCoderResult;
220 
221     private boolean endOfInput;
222 
223     @SuppressWarnings("resource") // caller closes.
224     private ReaderInputStream(final Builder builder) throws IOException {
225         this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
226     }
227 
228     /**
229      * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@link Charset#defaultCharset() default charset} with a default input buffer
230      * size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
231      *
232      * @param reader the target {@link Reader}
233      * @deprecated Use {@link ReaderInputStream#builder()} instead
234      */
235     @Deprecated
236     public ReaderInputStream(final Reader reader) {
237         this(reader, Charset.defaultCharset());
238     }
239 
240     /**
241      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
242      *
243      * <p>
244      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
245      * </p>
246      *
247      * @param reader  the target {@link Reader}
248      * @param charset the charset encoding
249      * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
250      */
251     @Deprecated
252     public ReaderInputStream(final Reader reader, final Charset charset) {
253         this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
254     }
255 
256     /**
257      * Constructs a new {@link ReaderInputStream}.
258      *
259      * <p>
260      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
261      * </p>
262      *
263      * @param reader     the target {@link Reader}.
264      * @param charset    the charset encoding.
265      * @param bufferSize the size of the input buffer in number of characters.
266      * @deprecated Use {@link ReaderInputStream#builder()} instead
267      */
268     @Deprecated
269     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
270         // @formatter:off
271         this(reader,
272             Charsets.toCharset(charset).newEncoder()
273                     .onMalformedInput(CodingErrorAction.REPLACE)
274                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
275              bufferSize);
276         // @formatter:on
277     }
278 
279     /**
280      * Constructs a new {@link ReaderInputStream}.
281      *
282      * <p>
283      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
284      * an encoder which had already been in use.
285      * </p>
286      *
287      * @param reader         the target {@link Reader}
288      * @param charsetEncoder the charset encoder
289      * @since 2.1
290      * @deprecated Use {@link ReaderInputStream#builder()} instead
291      */
292     @Deprecated
293     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
294         this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
295     }
296 
297     /**
298      * Constructs a new {@link ReaderInputStream}.
299      *
300      * <p>
301      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
302      * an encoder which had already been in use.
303      * </p>
304      *
305      * @param reader         the target {@link Reader}
306      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
307      * @param bufferSize     the size of the input buffer in number of characters
308      * @since 2.1
309      * @deprecated Use {@link ReaderInputStream#builder()} instead
310      */
311     @Deprecated
312     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
313         this.reader = reader;
314         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
315         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
316         this.encoderIn.flip();
317         this.encoderOut = ByteBuffer.allocate(128);
318         this.encoderOut.flip();
319     }
320 
321     /**
322      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
323      *
324      * <p>
325      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
326      * </p>
327      *
328      * @param reader      the target {@link Reader}
329      * @param charsetName the name of the charset encoding
330      * @deprecated Use {@link ReaderInputStream#builder()} instead
331      */
332     @Deprecated
333     public ReaderInputStream(final Reader reader, final String charsetName) {
334         this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
335     }
336 
337     /**
338      * Constructs a new {@link ReaderInputStream}.
339      *
340      * <p>
341      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
342      * </p>
343      *
344      * @param reader      the target {@link Reader}
345      * @param charsetName the name of the charset encoding, null maps to the default Charset.
346      * @param bufferSize  the size of the input buffer in number of characters
347      * @deprecated Use {@link ReaderInputStream#builder()} instead
348      */
349     @Deprecated
350     public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
351         this(reader, Charsets.toCharset(charsetName), bufferSize);
352     }
353 
354     @Override
355     public int available() throws IOException {
356         if (encoderOut.hasRemaining()) {
357             return encoderOut.remaining();
358         }
359         return 0;
360     }
361 
362     /**
363      * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
364      *
365      * @throws IOException if an I/O error occurs.
366      */
367     @Override
368     public void close() throws IOException {
369         reader.close();
370         super.close();
371     }
372 
373     /**
374      * Fills the internal char buffer from the reader.
375      *
376      * @throws IOException If an I/O error occurs
377      */
378     private void fillBuffer() throws IOException {
379         if (endOfInput) {
380             return;
381         }
382         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
383             encoderIn.compact();
384             final int position = encoderIn.position();
385             // We don't use Reader#read(CharBuffer) here because it is more efficient
386             // to write directly to the underlying char array (the default implementation
387             // copies data to a temporary char array).
388             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
389             if (c == EOF) {
390                 endOfInput = true;
391             } else {
392                 encoderIn.position(position + c);
393             }
394             encoderIn.flip();
395         }
396         encoderOut.compact();
397         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
398         if (endOfInput) {
399             lastCoderResult = charsetEncoder.flush(encoderOut);
400         }
401         if (lastCoderResult.isError()) {
402             lastCoderResult.throwException();
403         }
404         encoderOut.flip();
405     }
406 
407     /**
408      * Gets the CharsetEncoder.
409      *
410      * @return the CharsetEncoder.
411      */
412     CharsetEncoder getCharsetEncoder() {
413         return charsetEncoder;
414     }
415 
416     /**
417      * Reads a single byte.
418      *
419      * @return either the byte read or {@code -1} if the end of the stream has been reached
420      * @throws IOException if an I/O error occurs.
421      */
422     @Override
423     public int read() throws IOException {
424         checkOpen();
425         for (;;) {
426             if (encoderOut.hasRemaining()) {
427                 return encoderOut.get() & 0xFF;
428             }
429             fillBuffer();
430             if (endOfInput && !encoderOut.hasRemaining()) {
431                 return EOF;
432             }
433         }
434     }
435 
436     /**
437      * Reads the specified number of bytes into an array.
438      *
439      * @param b the byte array to read into
440      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
441      * @throws IOException if an I/O error occurs.
442      */
443     @Override
444     public int read(final byte[] b) throws IOException {
445         return read(b, 0, b.length);
446     }
447 
448     /**
449      * Reads the specified number of bytes into an array.
450      *
451      * @param array the byte array to read into
452      * @param off   the offset to start reading bytes into
453      * @param len   the number of bytes to read
454      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
455      * @throws IOException if an I/O error occurs.
456      */
457     @Override
458     public int read(final byte[] array, int off, int len) throws IOException {
459         Objects.requireNonNull(array, "array");
460         if (len < 0 || off < 0 || off + len > array.length) {
461             throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
462         }
463         int read = 0;
464         if (len == 0) {
465             return 0; // Always return 0 if len == 0
466         }
467         while (len > 0) {
468             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
469                 final int c = Math.min(encoderOut.remaining(), len);
470                 encoderOut.get(array, off, c);
471                 off += c;
472                 len -= c;
473                 read += c;
474             } else if (endOfInput) { // Already reach EOF in the last read
475                 break;
476             } else { // Read again
477                 fillBuffer();
478             }
479         }
480         return read == 0 && endOfInput ? EOF : read;
481     }
482 }