1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.io.input;
18
19 import static org.apache.commons.io.IOUtils.EOF;
20
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.OutputStreamWriter;
26 import java.io.Reader;
27 import java.nio.ByteBuffer;
28 import java.nio.CharBuffer;
29 import java.nio.charset.Charset;
30 import java.nio.charset.CharsetEncoder;
31 import java.nio.charset.CoderResult;
32 import java.nio.charset.CodingErrorAction;
33
34 import org.apache.commons.io.Charsets;
35 import org.apache.commons.io.IOUtils;
36 import org.apache.commons.io.build.AbstractStreamBuilder;
37 import org.apache.commons.io.charset.CharsetEncoders;
38
39 /**
40 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
41 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
42 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
43 * <p>
44 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
45 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
46 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
47 * {@link BufferedReader}.
48 * </p>
49 * <p>
50 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
51 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
52 * </p>
53 * <p>
54 * To build an instance, use {@link Builder}.
55 * </p>
56 * <pre>
57 * InputStream inputStream = ...
58 * Charset cs = ...
59 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
60 * ReaderInputStream in2 = ReaderInputStream.builder()
61 * .setReader(reader)
62 * .setCharset(cs)
63 * .get();
64 * </pre>
65 * <p>
66 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
67 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
68 * pulls it from the underlying stream.
69 * </p>
70 * <p>
71 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
72 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
73 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
74 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
75 * </p>
76 * <p>
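 * The {@link #available()} method of this class returns the number of already encoded bytes that can be read without accessing the underlying {@link Reader};
 * it may return 0 even though the {@link Reader} still has data. The methods {@link #mark(int)} and {@link #reset()} are not supported.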
78 * </p>
79 * <p>
80 * Instances of {@link ReaderInputStream} are not thread safe.
81 * </p>
82 *
83 * @see Builder
84 * @see org.apache.commons.io.output.WriterOutputStream
85 * @since 2.0
86 */
87 public class ReaderInputStream extends AbstractInputStream {
88
89 // @formatter:off
90 /**
91 * Builds a new {@link ReaderInputStream}.
92 *
93 * <p>
94 * For example:
95 * </p>
96 * <pre>{@code
97 * ReaderInputStream s = ReaderInputStream.builder()
98 * .setPath(path)
99 * .setCharsetEncoder(Charset.defaultCharset().newEncoder())
100 * .get();}
101 * </pre>
102 *
103 * @see #get()
104 * @since 2.12.0
105 */
106 // @formatter:on
107 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
108
109 private CharsetEncoder charsetEncoder = newEncoder(getCharset());
110
111 /**
112 * Constructs a new builder of {@link ReaderInputStream}.
113 */
114 public Builder() {
115 // empty
116 }
117
118 /**
119 * Builds a new {@link ReaderInputStream}.
120 *
121 * <p>
122 * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
123 * </p>
124 * <p>
125 * This builder uses the following aspects:
126 * </p>
127 * <ul>
128 * <li>{@link #getReader()} gets the target aspect.</li>
129 * <li>{@link #getBufferSize()}</li>
130 * <li>{@link #getCharset()}</li>
131 * <li>{@link CharsetEncoder}</li>
132 * </ul>
133 *
134 * @return a new instance.
135 * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
136 * @throws IllegalStateException if the {@code origin} is {@code null}.
137 * @throws IOException if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
138 * @see #getReader()
139 * @see CharsetEncoder
140 * @see #getBufferSize()
141 * @see #getUnchecked()
142 */
143 @Override
144 public ReaderInputStream get() throws IOException {
145 return new ReaderInputStream(this);
146 }
147
148 CharsetEncoder getCharsetEncoder() {
149 return charsetEncoder;
150 }
151
152 @Override
153 public Builder setCharset(final Charset charset) {
154 super.setCharset(charset);
155 charsetEncoder = newEncoder(getCharset());
156 return this;
157 }
158
159 /**
160 * Sets the charset encoder. Assumes that the caller has configured the encoder.
161 *
162 * @param newEncoder the charset encoder, null resets to a default encoder.
163 * @return {@code this} instance.
164 */
165 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
166 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
167 super.setCharset(charsetEncoder.charset());
168 return this;
169 }
170
171 }
172
173 /**
174 * Constructs a new {@link Builder}.
175 *
176 * @return a new {@link Builder}.
177 * @since 2.12.0
178 */
179 public static Builder builder() {
180 return new Builder();
181 }
182
183 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
184 final float minRequired = minBufferSize(charsetEncoder);
185 if (bufferSize < minRequired) {
186 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
187 charsetEncoder.charset().displayName()));
188 }
189 return bufferSize;
190 }
191
192 static float minBufferSize(final CharsetEncoder charsetEncoder) {
193 return charsetEncoder.maxBytesPerChar() * 2;
194 }
195
196 private static CharsetEncoder newEncoder(final Charset charset) {
197 // @formatter:off
198 return Charsets.toCharset(charset).newEncoder()
199 .onMalformedInput(CodingErrorAction.REPLACE)
200 .onUnmappableCharacter(CodingErrorAction.REPLACE);
201 // @formatter:on
202 }
203
204 private final Reader reader;
205
206 private final CharsetEncoder charsetEncoder;
207
208 /**
209 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
210 */
211 private final CharBuffer encoderIn;
212 /**
213 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
214 * caller.
215 */
216 private final ByteBuffer encoderOut;
217
218 private CoderResult lastCoderResult;
219
220 private boolean endOfInput;
221
222 @SuppressWarnings("resource") // caller closes.
223 private ReaderInputStream(final Builder builder) throws IOException {
224 this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
225 }
226
227 /**
228 * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@linkplain Charset#defaultCharset() default charset} with a default input
229 * buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
230 *
231 * @param reader the target {@link Reader}
232 * @deprecated Use {@link ReaderInputStream#builder()} instead
233 */
234 @Deprecated
235 public ReaderInputStream(final Reader reader) {
236 this(reader, Charset.defaultCharset());
237 }
238
239 /**
240 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
241 *
242 * <p>
243 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
244 * </p>
245 *
246 * @param reader the target {@link Reader}
247 * @param charset the charset encoding
248 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
249 */
250 @Deprecated
251 public ReaderInputStream(final Reader reader, final Charset charset) {
252 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
253 }
254
255 /**
256 * Constructs a new {@link ReaderInputStream}.
257 *
258 * <p>
259 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
260 * </p>
261 *
262 * @param reader the target {@link Reader}.
263 * @param charset the charset encoding.
264 * @param bufferSize the size of the input buffer in number of characters.
265 * @deprecated Use {@link ReaderInputStream#builder()} instead
266 */
267 @Deprecated
268 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
269 // @formatter:off
270 this(reader,
271 Charsets.toCharset(charset).newEncoder()
272 .onMalformedInput(CodingErrorAction.REPLACE)
273 .onUnmappableCharacter(CodingErrorAction.REPLACE),
274 bufferSize);
275 // @formatter:on
276 }
277
278 /**
279 * Constructs a new {@link ReaderInputStream}.
280 *
281 * <p>
282 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
283 * an encoder which had already been in use.
284 * </p>
285 *
286 * @param reader the target {@link Reader}
287 * @param charsetEncoder the charset encoder
288 * @since 2.1
289 * @deprecated Use {@link ReaderInputStream#builder()} instead
290 */
291 @Deprecated
292 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
293 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
294 }
295
296 /**
297 * Constructs a new {@link ReaderInputStream}.
298 *
299 * <p>
300 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
301 * an encoder which had already been in use.
302 * </p>
303 *
304 * @param reader the target {@link Reader}
305 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
306 * @param bufferSize the size of the input buffer in number of characters
307 * @since 2.1
308 * @deprecated Use {@link ReaderInputStream#builder()} instead
309 */
310 @Deprecated
311 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
312 this.reader = reader;
313 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
314 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
315 this.encoderIn.flip();
316 this.encoderOut = ByteBuffer.allocate(128);
317 this.encoderOut.flip();
318 }
319
320 /**
321 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
322 *
323 * <p>
324 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
325 * </p>
326 *
327 * @param reader the target {@link Reader}
328 * @param charsetName the name of the charset encoding
329 * @deprecated Use {@link ReaderInputStream#builder()} instead
330 */
331 @Deprecated
332 public ReaderInputStream(final Reader reader, final String charsetName) {
333 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
334 }
335
336 /**
337 * Constructs a new {@link ReaderInputStream}.
338 *
339 * <p>
340 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
341 * </p>
342 *
343 * @param reader the target {@link Reader}
344 * @param charsetName the name of the charset encoding, null maps to the default Charset.
345 * @param bufferSize the size of the input buffer in number of characters
346 * @deprecated Use {@link ReaderInputStream#builder()} instead
347 */
348 @Deprecated
349 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
350 this(reader, Charsets.toCharset(charsetName), bufferSize);
351 }
352
353 @Override
354 public int available() throws IOException {
355 if (encoderOut.hasRemaining()) {
356 return encoderOut.remaining();
357 }
358 return 0;
359 }
360
361 /**
362 * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
363 *
364 * @throws IOException if an I/O error occurs.
365 */
366 @Override
367 public void close() throws IOException {
368 reader.close();
369 super.close();
370 }
371
372 /**
373 * Fills the internal char buffer from the reader.
374 *
375 * @throws IOException If an I/O error occurs
376 */
377 private void fillBuffer() throws IOException {
378 if (endOfInput) {
379 return;
380 }
381 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
382 encoderIn.compact();
383 final int position = encoderIn.position();
384 // We don't use Reader#read(CharBuffer) here because it is more efficient
385 // to write directly to the underlying char array (the default implementation
386 // copies data to a temporary char array).
387 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
388 if (c == EOF) {
389 endOfInput = true;
390 } else {
391 encoderIn.position(position + c);
392 }
393 encoderIn.flip();
394 }
395 encoderOut.compact();
396 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
397 if (endOfInput) {
398 lastCoderResult = charsetEncoder.flush(encoderOut);
399 }
400 if (lastCoderResult.isError()) {
401 lastCoderResult.throwException();
402 }
403 encoderOut.flip();
404 }
405
406 /**
407 * Gets the CharsetEncoder.
408 *
409 * @return the CharsetEncoder.
410 */
411 CharsetEncoder getCharsetEncoder() {
412 return charsetEncoder;
413 }
414
415 /**
416 * Reads a single byte.
417 *
418 * @return either the byte read or {@code -1} if the end of the stream has been reached
419 * @throws IOException if an I/O error occurs.
420 */
421 @Override
422 public int read() throws IOException {
423 checkOpen();
424 for (;;) {
425 if (encoderOut.hasRemaining()) {
426 return encoderOut.get() & 0xFF;
427 }
428 fillBuffer();
429 if (endOfInput && !encoderOut.hasRemaining()) {
430 return EOF;
431 }
432 }
433 }
434
435 /**
436 * Reads the specified number of bytes into an array.
437 *
438 * @param b the byte array to read into, must not be {@code null}
439 * @return the number of bytes read or {@code -1} if the end of the stream has been reached
440 * @throws NullPointerException if the byte array is {@code null}.
441 * @throws IOException if an I/O error occurs.
442 */
443 @Override
444 public int read(final byte[] b) throws IOException {
445 return read(b, 0, b.length);
446 }
447
448 /**
449 * Reads the specified number of bytes into an array.
450 *
451 * @param array the byte array to read into
452 * @param off the offset to start reading bytes into
453 * @param len the number of bytes to read
454 * @return the number of bytes read or {@code -1} if the end of the stream has been reached
455 * @throws NullPointerException if the byte array is {@code null}.
456 * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code array.length}.
457 * @throws IOException if an I/O error occurs.
458 */
459 @Override
460 public int read(final byte[] array, int off, int len) throws IOException {
461 IOUtils.checkFromIndexSize(array, off, len);
462 if (len == 0) {
463 return 0; // Always return 0 if len == 0
464 }
465 int read = 0;
466 while (len > 0) {
467 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
468 final int c = Math.min(encoderOut.remaining(), len);
469 encoderOut.get(array, off, c);
470 off += c;
471 len -= c;
472 read += c;
473 } else if (endOfInput) { // Already reach EOF in the last read
474 break;
475 } else { // Read again
476 fillBuffer();
477 }
478 }
479 return read == 0 && endOfInput ? EOF : read;
480 }
481 }