View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.apache.commons.io.IOUtils.EOF;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.Reader;
24  import java.nio.ByteBuffer;
25  import java.nio.CharBuffer;
26  import java.nio.charset.Charset;
27  import java.nio.charset.CharsetEncoder;
28  import java.nio.charset.CoderResult;
29  import java.nio.charset.CodingErrorAction;
30  import java.util.Objects;
31  
32  import org.apache.commons.io.Charsets;
33  import org.apache.commons.io.IOUtils;
34  import org.apache.commons.io.build.AbstractStreamBuilder;
35  import org.apache.commons.io.charset.CharsetEncoders;
36  
37  /**
38   * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
39   * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
40   * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
41   * <p>
42   * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
43   * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
44   * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
45   * {@link java.io.BufferedReader}.
46   * </p>
47   * <p>
48   * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2}
49   * would return the same byte sequence as reading from {@code inputStream} (provided that the initial byte sequence is legal with respect to the charset encoding):
50   * </p>
51   * <p>
52   * To build an instance, use {@link Builder}.
53   * </p>
54   * <pre>
55   * InputStream inputStream = ...
56   * Charset cs = ...
57   * InputStreamReader reader = new InputStreamReader(inputStream, cs);
58   * ReaderInputStream in2 = ReaderInputStream.builder()
59   *   .setReader(reader)
60   *   .setCharset(cs)
61   *   .get();
62   * </pre>
63   * <p>
64   * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
65   * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
66   * pulls it from the underlying stream.
67   * </p>
68   * <p>
69   * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
70   * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
71   * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
72   * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
73   * </p>
74   * <p>
75   * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
76   * </p>
77   * <p>
78   * Instances of {@link ReaderInputStream} are not thread safe.
79   * </p>
80   *
81   * @see Builder
82   * @see org.apache.commons.io.output.WriterOutputStream
83   * @since 2.0
84   */
85  public class ReaderInputStream extends InputStream {
86  
87      // @formatter:off
88      /**
89       * Builds a new {@link ReaderInputStream}.
90       *
91       * <p>
92       * For example:
93       * </p>
94       * <pre>{@code
95       * ReaderInputStream s = ReaderInputStream.builder()
96       *   .setPath(path)
97       *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
98       *   .get();}
99       * </pre>
100      *
101      * @see #get()
102      * @since 2.12.0
103      */
104     // @formatter:on
105     public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
106 
107         private CharsetEncoder charsetEncoder = newEncoder(getCharset());
108 
109         /**
110          * Builds a new {@link ReaderInputStream}.
111          *
112          * <p>
113          * You must set input that supports {@link #getReader()}, otherwise, this method throws an exception.
114          * </p>
115          * <p>
116          * This builder use the following aspects:
117          * </p>
118          * <ul>
119          * <li>{@link #getReader()}</li>
120          * <li>{@link #getBufferSize()}</li>
121          * <li>{@link #getCharset()}</li>
122          * <li>{@link CharsetEncoder}</li>
123          * </ul>
124          *
125          * @return a new instance.
126          * @throws UnsupportedOperationException if the origin cannot provide a Reader.
127          * @throws IllegalStateException if the {@code origin} is {@code null}.
128          * @see #getReader()
129          * @see CharsetEncoder
130          * @see #getBufferSize()
131          */
132         @SuppressWarnings("resource")
133         @Override
134         public ReaderInputStream get() throws IOException {
135             return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize());
136         }
137 
138         CharsetEncoder getCharsetEncoder() {
139             return charsetEncoder;
140         }
141 
142         @Override
143         public Builder setCharset(final Charset charset) {
144             super.setCharset(charset);
145             charsetEncoder = newEncoder(getCharset());
146             return this;
147         }
148 
149         /**
150          * Sets the charset encoder. Assumes that the caller has configured the encoder.
151          *
152          * @param newEncoder the charset encoder, null resets to a default encoder.
153          * @return this
154          */
155         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
156             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
157             super.setCharset(charsetEncoder.charset());
158             return this;
159         }
160 
161     }
162 
    /**
     * Constructs a new {@link Builder}.
     *
     * <p>
     * Every call returns a fresh builder instance.
     * </p>
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
172 
173     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
174         final float minRequired = minBufferSize(charsetEncoder);
175         if (bufferSize < minRequired) {
176             throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
177                     charsetEncoder.charset().displayName()));
178         }
179         return bufferSize;
180     }
181 
182     static float minBufferSize(final CharsetEncoder charsetEncoder) {
183         return charsetEncoder.maxBytesPerChar() * 2;
184     }
185 
186     private static CharsetEncoder newEncoder(final Charset charset) {
187         // @formatter:off
188         return Charsets.toCharset(charset).newEncoder()
189                 .onMalformedInput(CodingErrorAction.REPLACE)
190                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
191         // @formatter:on
192     }
193 
    /** The character source; closed by {@link #close()}. */
    private final Reader reader;

    /** The encoder that turns characters from {@link #encoderIn} into bytes in {@link #encoderOut}. */
    private final CharsetEncoder charsetEncoder;

    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
     */
    private final CharBuffer encoderIn;
    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the encoder to the buffer provided by the
     * caller.
     */
    private final ByteBuffer encoderOut;

    /** Result of the most recent encode or flush call; {@code null} until the first fill. */
    private CoderResult lastCoderResult;

    /** Set to {@code true} once the underlying Reader has reported end of stream. */
    private boolean endOfInput;
211 
    /**
     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The charset is obtained from {@link Charset#defaultCharset()} when this constructor runs.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader) {
        this(reader, Charset.defaultCharset());
    }
223 
    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     * <p>
     * Delegates to {@link #ReaderInputStream(Reader, Charset, int)}.
     * </p>
     *
     * @param reader  the target {@link Reader}
     * @param charset the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final Charset charset) {
        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
    }
239 
240     /**
241      * Constructs a new {@link ReaderInputStream}.
242      *
243      * <p>
244      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
245      * </p>
246      *
247      * @param reader     the target {@link Reader}.
248      * @param charset    the charset encoding.
249      * @param bufferSize the size of the input buffer in number of characters.
250      * @deprecated Use {@link ReaderInputStream#builder()} instead
251      */
252     @Deprecated
253     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
254         // @formatter:off
255         this(reader,
256             Charsets.toCharset(charset).newEncoder()
257                     .onMalformedInput(CodingErrorAction.REPLACE)
258                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
259              bufferSize);
260         // @formatter:on
261     }
262 
    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
     * an encoder which had already been in use.
     * </p>
     *
     * @param reader         the target {@link Reader}
     * @param charsetEncoder the charset encoder
     * @since 2.1
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
    }
280 
281     /**
282      * Constructs a new {@link ReaderInputStream}.
283      *
284      * <p>
285      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
286      * an encoder which had already been in use.
287      * </p>
288      *
289      * @param reader         the target {@link Reader}
290      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
291      * @param bufferSize     the size of the input buffer in number of characters
292      * @since 2.1
293      * @deprecated Use {@link ReaderInputStream#builder()} instead
294      */
295     @Deprecated
296     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
297         this.reader = reader;
298         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
299         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
300         this.encoderIn.flip();
301         this.encoderOut = ByteBuffer.allocate(128);
302         this.encoderOut.flip();
303     }
304 
    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     * <p>
     * Delegates to {@link #ReaderInputStream(Reader, String, int)}.
     * </p>
     *
     * @param reader      the target {@link Reader}
     * @param charsetName the name of the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName) {
        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
    }
320 
    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     * <p>
     * The name is resolved to a {@link Charset} and the work is delegated to {@link #ReaderInputStream(Reader, Charset, int)}.
     * </p>
     *
     * @param reader      the target {@link Reader}
     * @param charsetName the name of the charset encoding, null maps to the default Charset.
     * @param bufferSize  the size of the input buffer in number of characters
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
        this(reader, Charsets.toCharset(charsetName), bufferSize);
    }
337 
    /**
     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
     *
     * <p>
     * Only the reader is closed; the internal buffers and end-of-input state are left untouched.
     * </p>
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }
347 
348     /**
349      * Fills the internal char buffer from the reader.
350      *
351      * @throws IOException If an I/O error occurs
352      */
353     private void fillBuffer() throws IOException {
354         if (endOfInput) {
355             return;
356         }
357         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
358             encoderIn.compact();
359             final int position = encoderIn.position();
360             // We don't use Reader#read(CharBuffer) here because it is more efficient
361             // to write directly to the underlying char array (the default implementation
362             // copies data to a temporary char array).
363             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
364             if (c == EOF) {
365                 endOfInput = true;
366             } else {
367                 encoderIn.position(position + c);
368             }
369             encoderIn.flip();
370         }
371         encoderOut.compact();
372         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
373         if (endOfInput) {
374             lastCoderResult = charsetEncoder.flush(encoderOut);
375         }
376         if (lastCoderResult.isError()) {
377             lastCoderResult.throwException();
378         }
379         encoderOut.flip();
380     }
381 
    /**
     * Gets the CharsetEncoder used by this stream to encode characters into bytes.
     *
     * @return the CharsetEncoder.
     */
    CharsetEncoder getCharsetEncoder() {
        return charsetEncoder;
    }
390 
391     /**
392      * Reads a single byte.
393      *
394      * @return either the byte read or {@code -1} if the end of the stream has been reached
395      * @throws IOException if an I/O error occurs.
396      */
397     @Override
398     public int read() throws IOException {
399         for (;;) {
400             if (encoderOut.hasRemaining()) {
401                 return encoderOut.get() & 0xFF;
402             }
403             fillBuffer();
404             if (endOfInput && !encoderOut.hasRemaining()) {
405                 return EOF;
406             }
407         }
408     }
409 
    /**
     * Reads the specified number of bytes into an array.
     *
     * <p>
     * Equivalent to {@code read(b, 0, b.length)}.
     * </p>
     *
     * @param b the byte array to read into
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     * @throws NullPointerException if {@code b} is {@code null}.
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return read(b, 0, b.length);
    }
421 
422     /**
423      * Reads the specified number of bytes into an array.
424      *
425      * @param array the byte array to read into
426      * @param off   the offset to start reading bytes into
427      * @param len   the number of bytes to read
428      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
429      * @throws IOException if an I/O error occurs.
430      */
431     @Override
432     public int read(final byte[] array, int off, int len) throws IOException {
433         Objects.requireNonNull(array, "array");
434         if (len < 0 || off < 0 || off + len > array.length) {
435             throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
436         }
437         int read = 0;
438         if (len == 0) {
439             return 0; // Always return 0 if len == 0
440         }
441         while (len > 0) {
442             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
443                 final int c = Math.min(encoderOut.remaining(), len);
444                 encoderOut.get(array, off, c);
445                 off += c;
446                 len -= c;
447                 read += c;
448             } else if (endOfInput) { // Already reach EOF in the last read
449                 break;
450             } else { // Read again
451                 fillBuffer();
452             }
453         }
454         return read == 0 && endOfInput ? EOF : read;
455     }
456 }