/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.io.charset.CharsetEncoders;

/**
 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
 * <p>
 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
 * {@link BufferedReader}.
 * </p>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 * <p>
 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
 * </p>
 * <pre>
 * InputStream inputStream = ...
 * Charset cs = ...
 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
 * ReaderInputStream in2 = ReaderInputStream.builder()
 *     .setReader(reader)
 *     .setCharset(cs)
 *     .get();
 * </pre>
 * <p>
 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
 * pulls it from the underlying stream.
 * </p>
 * <p>
 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
 * </p>
 * <p>
 * The {@link #available()} method of this class only reports encoded bytes that are already buffered internally; it returns 0 whenever that buffer is empty,
 * even if the underlying {@link Reader} still has characters to deliver. The methods {@link #mark(int)} and {@link #reset()} are not supported.
 * </p>
 * <p>
 * Instances of {@link ReaderInputStream} are not thread safe.
 * </p>
 *
 * @see Builder
 * @see org.apache.commons.io.output.WriterOutputStream
 * @since 2.0
 */
public class ReaderInputStream extends AbstractInputStream {

    // @formatter:off
    /**
     * Builds a new {@link ReaderInputStream}.
     *
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * ReaderInputStream s = ReaderInputStream.builder()
     *   .setPath(path)
     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.12.0
     */
    // @formatter:on
    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {

        /** Encoder handed to the stream; kept in sync with the builder's charset by the setters below. */
        private CharsetEncoder charsetEncoder = newEncoder(getCharset());

        /**
         * Constructs a new builder of {@link ReaderInputStream}.
         */
        public Builder() {
            // empty
        }

        /**
         * Builds a new {@link ReaderInputStream}.
         *
         * <p>
         * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder uses the following aspects:
         * </p>
         * <ul>
         * <li>{@link #getReader()} gets the target aspect.</li>
         * <li>{@link #getBufferSize()}</li>
         * <li>{@link #getCharset()}</li>
         * <li>{@link CharsetEncoder}</li>
         * </ul>
         *
         * @return a new instance.
         * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}.
         * @throws IllegalStateException         if the {@code origin} is {@code null}.
         * @throws IOException                   if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}.
         * @see #getReader()
         * @see CharsetEncoder
         * @see #getBufferSize()
         * @see #getUnchecked()
         */
        @Override
        public ReaderInputStream get() throws IOException {
            return new ReaderInputStream(this);
        }

        /**
         * Gets the encoder currently configured on this builder.
         *
         * @return the configured encoder.
         */
        CharsetEncoder getCharsetEncoder() {
            return charsetEncoder;
        }

        /**
         * Sets the charset and replaces the encoder with a fresh one created for that charset (using {@link CodingErrorAction#REPLACE} for malformed input and
         * unmappable characters).
         *
         * @param charset the charset, as accepted by the superclass setter.
         * @return {@code this} instance.
         */
        @Override
        public Builder setCharset(final Charset charset) {
            super.setCharset(charset);
            // Read the charset back from the superclass so that whatever it resolved the argument to is used.
            charsetEncoder = newEncoder(getCharset());
            return this;
        }

        /**
         * Sets the charset encoder. Assumes that the caller has configured the encoder.
         *
         * @param newEncoder the charset encoder, null resets to a default encoder.
         * @return {@code this} instance.
         */
        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
            // Keep the builder's charset consistent with the encoder that will actually be used.
            super.setCharset(charsetEncoder.charset());
            return this;
        }

    }

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Validates that a requested input buffer size is large enough for the given encoder.
     *
     * @param charsetEncoder the encoder the buffer will feed.
     * @param bufferSize     the requested buffer size in characters.
     * @return {@code bufferSize}, unchanged, when it is acceptable.
     * @throws IllegalArgumentException if {@code bufferSize} is smaller than {@link #minBufferSize(CharsetEncoder)}.
     */
    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
        final float minRequired = minBufferSize(charsetEncoder);
        if (bufferSize < minRequired) {
            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
                    charsetEncoder.charset().displayName()));
        }
        return bufferSize;
    }

    /**
     * Computes the minimum input buffer size accepted for an encoder: twice its {@link CharsetEncoder#maxBytesPerChar()}.
     *
     * @param charsetEncoder the encoder to inspect.
     * @return the minimum buffer size.
     */
    static float minBufferSize(final CharsetEncoder charsetEncoder) {
        return charsetEncoder.maxBytesPerChar() * 2;
    }

    /**
     * Creates an encoder for a charset that replaces, rather than reports, malformed input and unmappable characters.
     *
     * @param charset the charset; first passed through {@link Charsets#toCharset(Charset)}.
     * @return a new encoder.
     */
    private static CharsetEncoder newEncoder(final Charset charset) {
        // @formatter:off
        return Charsets.toCharset(charset).newEncoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        // @formatter:on
    }

    /** The character source. */
    private final Reader reader;

    /** The encoder that turns characters from {@link #reader} into bytes. */
    private final CharsetEncoder charsetEncoder;

    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader into this buffer. Outside of
     * {@link #fillBuffer()} it is kept in read mode (flipped).
     */
    private final CharBuffer encoderIn;

    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the encoder to the buffer provided by the
     * caller. Outside of {@link #fillBuffer()} it is kept in read mode (flipped).
     */
    private final ByteBuffer encoderOut;

    /** Result of the most recent encode or flush call; {@code null} until the first call to {@link #fillBuffer()}. */
    private CoderResult lastCoderResult;

    /** Set once the underlying {@link Reader} has reported end of stream. */
    private boolean endOfInput;

    @SuppressWarnings("resource") // caller closes.
    private ReaderInputStream(final Builder builder) throws IOException {
        this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize());
    }

    /**
     * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@linkplain Charset#defaultCharset() default charset} with a default input
     * buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * @param reader the target {@link Reader}
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader) {
        this(reader, Charset.defaultCharset());
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader  the target {@link Reader}
     * @param charset the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final Charset charset) {
        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader     the target {@link Reader}.
     * @param charset    the charset encoding.
     * @param bufferSize the size of the input buffer in number of characters.
     * @throws IllegalArgumentException if {@code bufferSize} is below the minimum required by the charset's encoder.
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
        // Same encoder configuration as the builder uses; share the helper instead of duplicating it.
        this(reader, newEncoder(charset), bufferSize);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
     * an encoder which had already been in use.
     * </p>
     *
     * @param reader         the target {@link Reader}
     * @param charsetEncoder the charset encoder
     * @since 2.1
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
     * an encoder which had already been in use.
     * </p>
     *
     * @param reader         the target {@link Reader}
     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
     * @param bufferSize     the size of the input buffer in number of characters
     * @throws IllegalArgumentException if {@code bufferSize} is below the minimum required by the encoder.
     * @since 2.1
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
        this.reader = reader;
        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
        // Both buffers start out empty and in read mode; fillBuffer() compacts before writing into them.
        this.encoderIn.flip();
        this.encoderOut = ByteBuffer.allocate(128);
        this.encoderOut.flip();
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader      the target {@link Reader}
     * @param charsetName the name of the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName) {
        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader      the target {@link Reader}
     * @param charsetName the name of the charset encoding, null maps to the default Charset.
     * @param bufferSize  the size of the input buffer in number of characters
     * @throws IllegalArgumentException if {@code bufferSize} is below the minimum required by the charset's encoder.
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
        this(reader, Charsets.toCharset(charsetName), bufferSize);
    }

    /**
     * Returns the number of already-encoded bytes waiting in the internal output buffer. This never consults the underlying {@link Reader}, so the result is 0
     * whenever that buffer is empty.
     *
     * @return the number of buffered bytes that can be read immediately.
     * @throws IOException declared by the overridden method; not thrown by this implementation.
     */
    @Override
    public int available() throws IOException {
        return encoderOut.remaining();
    }

    /**
     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        try {
            reader.close();
        } finally {
            // Always run the superclass close, even when closing the reader fails.
            super.close();
        }
    }

    /**
     * Fills the internal char buffer from the reader and encodes as much of it as fits into the internal byte buffer.
     * <p>
     * Does nothing once the end of the underlying {@link Reader} has been reached. Both internal buffers are left in read mode, including when an exception
     * is thrown.
     * </p>
     *
     * @throws IOException If an I/O error occurs, including a {@link java.nio.charset.CharacterCodingException} when the encoder reports an error.
     */
    private void fillBuffer() throws IOException {
        if (endOfInput) {
            return;
        }
        // Only pull more characters when the encoder asked for more input; after an overflow there is still pending input to encode.
        if (lastCoderResult == null || lastCoderResult.isUnderflow()) {
            encoderIn.compact();
            try {
                final int position = encoderIn.position();
                // We don't use Reader#read(CharBuffer) here because it is more efficient
                // to write directly to the underlying char array (the default implementation
                // copies data to a temporary char array).
                final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
                if (c == EOF) {
                    endOfInput = true;
                } else {
                    encoderIn.position(position + c);
                }
            } finally {
                // Return to read mode even if the reader failed, so the buffer state stays consistent.
                encoderIn.flip();
            }
        }
        encoderOut.compact();
        try {
            lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
            // Do not let the flush result overwrite an encoding error; otherwise an encoder configured
            // with CodingErrorAction.REPORT would silently lose a malformed tail at end of input.
            if (endOfInput && !lastCoderResult.isError()) {
                lastCoderResult = charsetEncoder.flush(encoderOut);
            }
            if (lastCoderResult.isError()) {
                lastCoderResult.throwException();
            }
        } finally {
            // Return to read mode even on error so later reads never see unwritten buffer space.
            encoderOut.flip();
        }
    }

    /**
     * Gets the CharsetEncoder.
     *
     * @return the CharsetEncoder.
     */
    CharsetEncoder getCharsetEncoder() {
        return charsetEncoder;
    }

    /**
     * Reads a single byte.
     *
     * @return either the byte read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read() throws IOException {
        checkOpen();
        for (;;) {
            if (encoderOut.hasRemaining()) {
                return encoderOut.get() & 0xFF;
            }
            fillBuffer();
            if (endOfInput && !encoderOut.hasRemaining()) {
                return EOF;
            }
        }
    }

    /**
     * Reads the specified number of bytes into an array.
     *
     * @param b the byte array to read into, must not be {@code null}
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws NullPointerException if the byte array is {@code null}.
     * @throws IOException          if an I/O error occurs.
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Reads the specified number of bytes into an array.
     *
     * @param array the byte array to read into
     * @param off   the offset to start reading bytes into
     * @param len   the number of bytes to read
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws NullPointerException      if the byte array is {@code null}.
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code array.length}.
     * @throws IOException               if an I/O error occurs, or if a non-empty read is attempted after {@link #close()}.
     */
    @Override
    public int read(final byte[] array, int off, int len) throws IOException {
        IOUtils.checkFromIndexSize(array, off, len);
        if (len == 0) {
            return 0; // Always return 0 if len == 0
        }
        // Same open check as the single-byte read(), so both read methods agree after close().
        checkOpen();
        int read = 0;
        while (len > 0) {
            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
                final int c = Math.min(encoderOut.remaining(), len);
                encoderOut.get(array, off, c);
                off += c;
                len -= c;
                read += c;
            } else if (endOfInput) { // Already reach EOF in the last read
                break;
            } else { // Read again
                fillBuffer();
            }
        }
        return read == 0 && endOfInput ? EOF : read;
    }
}