View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.BufferedReader;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.OutputStreamWriter;
26  import java.io.Reader;
27  import java.nio.ByteBuffer;
28  import java.nio.CharBuffer;
29  import java.nio.charset.Charset;
30  import java.nio.charset.CharsetEncoder;
31  import java.nio.charset.CoderResult;
32  import java.nio.charset.CodingErrorAction;
33  
34  import org.apache.commons.io.Charsets;
35  import org.apache.commons.io.IOUtils;
36  import org.apache.commons.io.build.AbstractStreamBuilder;
37  import org.apache.commons.io.charset.CharsetEncoders;
38  
39  /**
40   * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
41   * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
42   * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
43   * <p>
44   * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
45   * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
46   * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
47   * {@link BufferedReader}.
48   * </p>
49   * <p>
50   * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
51   * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
52   * </p>
53   * <p>
54   * To build an instance, use {@link Builder}.
55   * </p>
56   * <pre>
57   * InputStream inputStream = ...
58   * Charset cs = ...
59   * InputStreamReader reader = new InputStreamReader(inputStream, cs);
60   * ReaderInputStream in2 = ReaderInputStream.builder()
61   *   .setReader(reader)
62   *   .setCharset(cs)
63   *   .get();
64   * </pre>
65   * <p>
66   * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
67   * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
68   * pulls it from the underlying stream.
69   * </p>
70   * <p>
71   * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
72   * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
73   * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
74   * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
75   * </p>
76   * <p>
77   * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
78   * </p>
79   * <p>
80   * Instances of {@link ReaderInputStream} are not thread safe.
81   * </p>
82   *
83   * @see Builder
84   * @see org.apache.commons.io.output.WriterOutputStream
85   * @since 2.0
86   */
87  public class ReaderInputStream extends AbstractInputStream {
88  
89      // @formatter:off
90      /**
91       * Builds a new {@link ReaderInputStream}.
92       *
93       * <p>
94       * For example:
95       * </p>
96       * <pre>{@code
97       * ReaderInputStream s = ReaderInputStream.builder()
98       *   .setPath(path)
99       *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
100      *   .get();}
101      * </pre>
102      *
103      * @see #get()
104      * @since 2.12.0
105      */
106     // @formatter:on
107     public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
108 
109         private CharsetEncoder charsetEncoder = newEncoder(getCharset());
110 
111         /**
112          * Constructs a new builder of {@link ReaderInputStream}.
113          */
114         public Builder() {
115             // empty
116         }
117 
118         /**
119          * Builds a new {@link ReaderInputStream}.
120          *
121          * <p>
122          * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
123          * </p>
124          * <p>
125          * This builder uses the following aspects:
126          * </p>
127          * <ul>
128          * <li>{@link #getReader()} gets the target aspect.</li>
129          * <li>{@link #getBufferSize()}</li>
130          * <li>{@link #getCharset()}</li>
131          * <li>{@link CharsetEncoder}</li>
132          * </ul>
133          *
134          * @return a new instance.
135          * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
136          * @throws IllegalStateException         if the {@code origin} is {@code null}.
137          * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
138          * @see #getReader()
139          * @see CharsetEncoder
140          * @see #getBufferSize()
141          * @see #getUnchecked()
142          */
143         @Override
144         public ReaderInputStream get() throws IOException {
145             return new ReaderInputStream(this);
146         }
147 
148         CharsetEncoder getCharsetEncoder() {
149             return charsetEncoder;
150         }
151 
152         @Override
153         public Builder setCharset(final Charset charset) {
154             super.setCharset(charset);
155             charsetEncoder = newEncoder(getCharset());
156             return this;
157         }
158 
159         /**
160          * Sets the charset encoder. Assumes that the caller has configured the encoder.
161          *
162          * @param newEncoder the charset encoder, null resets to a default encoder.
163          * @return {@code this} instance.
164          */
165         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
166             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
167             super.setCharset(charsetEncoder.charset());
168             return this;
169         }
170 
171     }
172 
173     /**
174      * Constructs a new {@link Builder}.
175      *
176      * @return a new {@link Builder}.
177      * @since 2.12.0
178      */
179     public static Builder builder() {
180         return new Builder();
181     }
182 
183     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
184         final float minRequired = minBufferSize(charsetEncoder);
185         if (bufferSize < minRequired) {
186             throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
187                     charsetEncoder.charset().displayName()));
188         }
189         return bufferSize;
190     }
191 
192     static float minBufferSize(final CharsetEncoder charsetEncoder) {
193         return charsetEncoder.maxBytesPerChar() * 2;
194     }
195 
196     private static CharsetEncoder newEncoder(final Charset charset) {
197         // @formatter:off
198         return Charsets.toCharset(charset).newEncoder()
199                 .onMalformedInput(CodingErrorAction.REPLACE)
200                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
201         // @formatter:on
202     }
203 
204     private final Reader reader;
205 
206     private final CharsetEncoder charsetEncoder;
207 
208     /**
209      * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
210      */
211     private final CharBuffer encoderIn;
212     /**
213      * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
214      * caller.
215      */
216     private final ByteBuffer encoderOut;
217 
218     private CoderResult lastCoderResult;
219 
220     private boolean endOfInput;
221 
222     @SuppressWarnings("resource") // caller closes.
223     private ReaderInputStream(final Builder builder) throws IOException {
224         this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
225     }
226 
227     /**
228      * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@linkplain Charset#defaultCharset() default charset} with a default input
229      * buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
230      *
231      * @param reader the target {@link Reader}
232      * @deprecated Use {@link ReaderInputStream#builder()} instead
233      */
234     @Deprecated
235     public ReaderInputStream(final Reader reader) {
236         this(reader, Charset.defaultCharset());
237     }
238 
239     /**
240      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
241      *
242      * <p>
243      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
244      * </p>
245      *
246      * @param reader  the target {@link Reader}
247      * @param charset the charset encoding
248      * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
249      */
250     @Deprecated
251     public ReaderInputStream(final Reader reader, final Charset charset) {
252         this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
253     }
254 
255     /**
256      * Constructs a new {@link ReaderInputStream}.
257      *
258      * <p>
259      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
260      * </p>
261      *
262      * @param reader     the target {@link Reader}.
263      * @param charset    the charset encoding.
264      * @param bufferSize the size of the input buffer in number of characters.
265      * @deprecated Use {@link ReaderInputStream#builder()} instead
266      */
267     @Deprecated
268     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
269         // @formatter:off
270         this(reader,
271             Charsets.toCharset(charset).newEncoder()
272                     .onMalformedInput(CodingErrorAction.REPLACE)
273                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
274              bufferSize);
275         // @formatter:on
276     }
277 
278     /**
279      * Constructs a new {@link ReaderInputStream}.
280      *
281      * <p>
282      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
283      * an encoder which had already been in use.
284      * </p>
285      *
286      * @param reader         the target {@link Reader}
287      * @param charsetEncoder the charset encoder
288      * @since 2.1
289      * @deprecated Use {@link ReaderInputStream#builder()} instead
290      */
291     @Deprecated
292     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
293         this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
294     }
295 
296     /**
297      * Constructs a new {@link ReaderInputStream}.
298      *
299      * <p>
300      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
301      * an encoder which had already been in use.
302      * </p>
303      *
304      * @param reader         the target {@link Reader}
305      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
306      * @param bufferSize     the size of the input buffer in number of characters
307      * @since 2.1
308      * @deprecated Use {@link ReaderInputStream#builder()} instead
309      */
310     @Deprecated
311     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
312         this.reader = reader;
313         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
314         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
315         this.encoderIn.flip();
316         this.encoderOut = ByteBuffer.allocate(128);
317         this.encoderOut.flip();
318     }
319 
320     /**
321      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
322      *
323      * <p>
324      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
325      * </p>
326      *
327      * @param reader      the target {@link Reader}
328      * @param charsetName the name of the charset encoding
329      * @deprecated Use {@link ReaderInputStream#builder()} instead
330      */
331     @Deprecated
332     public ReaderInputStream(final Reader reader, final String charsetName) {
333         this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
334     }
335 
336     /**
337      * Constructs a new {@link ReaderInputStream}.
338      *
339      * <p>
340      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
341      * </p>
342      *
343      * @param reader      the target {@link Reader}
344      * @param charsetName the name of the charset encoding, null maps to the default Charset.
345      * @param bufferSize  the size of the input buffer in number of characters
346      * @deprecated Use {@link ReaderInputStream#builder()} instead
347      */
348     @Deprecated
349     public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
350         this(reader, Charsets.toCharset(charsetName), bufferSize);
351     }
352 
353     @Override
354     public int available() throws IOException {
355         if (encoderOut.hasRemaining()) {
356             return encoderOut.remaining();
357         }
358         return 0;
359     }
360 
361     /**
362      * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
363      *
364      * @throws IOException if an I/O error occurs.
365      */
366     @Override
367     public void close() throws IOException {
368         reader.close();
369         super.close();
370     }
371 
372     /**
373      * Fills the internal char buffer from the reader.
374      *
375      * @throws IOException If an I/O error occurs
376      */
377     private void fillBuffer() throws IOException {
378         if (endOfInput) {
379             return;
380         }
381         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
382             encoderIn.compact();
383             final int position = encoderIn.position();
384             // We don't use Reader#read(CharBuffer) here because it is more efficient
385             // to write directly to the underlying char array (the default implementation
386             // copies data to a temporary char array).
387             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
388             if (c == EOF) {
389                 endOfInput = true;
390             } else {
391                 encoderIn.position(position + c);
392             }
393             encoderIn.flip();
394         }
395         encoderOut.compact();
396         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
397         if (endOfInput) {
398             lastCoderResult = charsetEncoder.flush(encoderOut);
399         }
400         if (lastCoderResult.isError()) {
401             lastCoderResult.throwException();
402         }
403         encoderOut.flip();
404     }
405 
406     /**
407      * Gets the CharsetEncoder.
408      *
409      * @return the CharsetEncoder.
410      */
411     CharsetEncoder getCharsetEncoder() {
412         return charsetEncoder;
413     }
414 
415     /**
416      * Reads a single byte.
417      *
418      * @return either the byte read or {@code -1} if the end of the stream has been reached
419      * @throws IOException if an I/O error occurs.
420      */
421     @Override
422     public int read() throws IOException {
423         checkOpen();
424         for (;;) {
425             if (encoderOut.hasRemaining()) {
426                 return encoderOut.get() & 0xFF;
427             }
428             fillBuffer();
429             if (endOfInput && !encoderOut.hasRemaining()) {
430                 return EOF;
431             }
432         }
433     }
434 
435     /**
436      * Reads the specified number of bytes into an array.
437      *
438      * @param b the byte array to read into, must not be {@code null}
439      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
440      * @throws NullPointerException if the byte array is {@code null}.
441      * @throws IOException if an I/O error occurs.
442      */
443     @Override
444     public int read(final byte[] b) throws IOException {
445         return read(b, 0, b.length);
446     }
447 
448     /**
449      * Reads the specified number of bytes into an array.
450      *
451      * @param array the byte array to read into
452      * @param off   the offset to start reading bytes into
453      * @param len   the number of bytes to read
454      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
455      * @throws NullPointerException      if the byte array is {@code null}.
456      * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code array.length}.
457      * @throws IOException if an I/O error occurs.
458      */
459     @Override
460     public int read(final byte[] array, int off, int len) throws IOException {
461         IOUtils.checkFromIndexSize(array, off, len);
462         if (len == 0) {
463             return 0; // Always return 0 if len == 0
464         }
465         int read = 0;
466         while (len > 0) {
467             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
468                 final int c = Math.min(encoderOut.remaining(), len);
469                 encoderOut.get(array, off, c);
470                 off += c;
471                 len -= c;
472                 read += c;
473             } else if (endOfInput) { // Already reach EOF in the last read
474                 break;
475             } else { // Read again
476                 fillBuffer();
477             }
478         }
479         return read == 0 && endOfInput ? EOF : read;
480     }
481 }