001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.OutputStreamWriter; 026import java.io.Reader; 027import java.nio.ByteBuffer; 028import java.nio.CharBuffer; 029import java.nio.charset.Charset; 030import java.nio.charset.CharsetEncoder; 031import java.nio.charset.CoderResult; 032import java.nio.charset.CodingErrorAction; 033import java.util.Objects; 034 035import org.apache.commons.io.Charsets; 036import org.apache.commons.io.IOUtils; 037import org.apache.commons.io.build.AbstractStreamBuilder; 038import org.apache.commons.io.charset.CharsetEncoders; 039 040/** 041 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 042 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 043 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 044 * <p> 045 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 046 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 047 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 048 * {@link BufferedReader}. 049 * </p> 050 * <p> 051 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2} 052 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 053 * </p> 054 * <p> 055 * To build an instance, use {@link Builder}. 056 * </p> 057 * <pre> 058 * InputStream inputStream = ... 059 * Charset cs = ... 060 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 061 * ReaderInputStream in2 = ReaderInputStream.builder() 062 * .setReader(reader) 063 * .setCharset(cs) 064 * .get(); 065 * </pre> 066 * <p> 067 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes 068 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 069 * pulls it from the underlying stream. 070 * </p> 071 * <p> 072 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 073 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 074 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 075 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 076 * </p> 077 * <p> 078 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 079 * </p> 080 * <p> 081 * Instances of {@link ReaderInputStream} are not thread safe. 082 * </p> 083 * 084 * @see Builder 085 * @see org.apache.commons.io.output.WriterOutputStream 086 * @since 2.0 087 */ 088public class ReaderInputStream extends AbstractInputStream { 089 090 // @formatter:off 091 /** 092 * Builds a new {@link ReaderInputStream}. 093 * 094 * <p> 095 * For example: 096 * </p> 097 * <pre>{@code 098 * ReaderInputStream s = ReaderInputStream.builder() 099 * .setPath(path) 100 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 101 * .get();} 102 * </pre> 103 * 104 * @see #get() 105 * @since 2.12.0 106 */ 107 // @formatter:on 108 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 109 110 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 111 112 /** 113 * Constructs a new builder of {@link ReaderInputStream}. 114 */ 115 public Builder() { 116 // empty 117 } 118 119 /** 120 * Builds a new {@link ReaderInputStream}. 121 * 122 * <p> 123 * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception. 124 * </p> 125 * <p> 126 * This builder uses the following aspects: 127 * </p> 128 * <ul> 129 * <li>{@link #getReader()} gets the target aspect.</li> 130 * <li>{@link #getBufferSize()}</li> 131 * <li>{@link #getCharset()}</li> 132 * <li>{@link CharsetEncoder}</li> 133 * </ul> 134 * 135 * @return a new instance. 136 * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}. 137 * @throws IllegalStateException if the {@code origin} is {@code null}. 138 * @throws IOException if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}. 139 * @see #getReader() 140 * @see CharsetEncoder 141 * @see #getBufferSize() 142 * @see #getUnchecked() 143 */ 144 @Override 145 public ReaderInputStream get() throws IOException { 146 return new ReaderInputStream(this); 147 } 148 149 CharsetEncoder getCharsetEncoder() { 150 return charsetEncoder; 151 } 152 153 @Override 154 public Builder setCharset(final Charset charset) { 155 super.setCharset(charset); 156 charsetEncoder = newEncoder(getCharset()); 157 return this; 158 } 159 160 /** 161 * Sets the charset encoder. Assumes that the caller has configured the encoder. 162 * 163 * @param newEncoder the charset encoder, null resets to a default encoder. 164 * @return {@code this} instance. 165 */ 166 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 167 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 168 super.setCharset(charsetEncoder.charset()); 169 return this; 170 } 171 172 } 173 174 /** 175 * Constructs a new {@link Builder}. 176 * 177 * @return a new {@link Builder}. 178 * @since 2.12.0 179 */ 180 public static Builder builder() { 181 return new Builder(); 182 } 183 184 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 185 final float minRequired = minBufferSize(charsetEncoder); 186 if (bufferSize < minRequired) { 187 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 188 charsetEncoder.charset().displayName())); 189 } 190 return bufferSize; 191 } 192 193 static float minBufferSize(final CharsetEncoder charsetEncoder) { 194 return charsetEncoder.maxBytesPerChar() * 2; 195 } 196 197 private static CharsetEncoder newEncoder(final Charset charset) { 198 // @formatter:off 199 return Charsets.toCharset(charset).newEncoder() 200 .onMalformedInput(CodingErrorAction.REPLACE) 201 .onUnmappableCharacter(CodingErrorAction.REPLACE); 202 // @formatter:on 203 } 204 205 private final Reader reader; 206 207 private final CharsetEncoder charsetEncoder; 208 209 /** 210 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 211 */ 212 private final CharBuffer encoderIn; 213 /** 214 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 215 * caller. 216 */ 217 private final ByteBuffer encoderOut; 218 219 private CoderResult lastCoderResult; 220 221 private boolean endOfInput; 222 223 @SuppressWarnings("resource") // caller closes. 224 private ReaderInputStream(final Builder builder) throws IOException { 225 this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize()); 226 } 227 228 /** 229 * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@link Charset#defaultCharset() default charset} with a default input buffer 230 * size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 231 * 232 * @param reader the target {@link Reader} 233 * @deprecated Use {@link ReaderInputStream#builder()} instead 234 */ 235 @Deprecated 236 public ReaderInputStream(final Reader reader) { 237 this(reader, Charset.defaultCharset()); 238 } 239 240 /** 241 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 242 * 243 * <p> 244 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 245 * </p> 246 * 247 * @param reader the target {@link Reader} 248 * @param charset the charset encoding 249 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 250 */ 251 @Deprecated 252 public ReaderInputStream(final Reader reader, final Charset charset) { 253 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 254 } 255 256 /** 257 * Constructs a new {@link ReaderInputStream}. 258 * 259 * <p> 260 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 261 * </p> 262 * 263 * @param reader the target {@link Reader}. 264 * @param charset the charset encoding. 265 * @param bufferSize the size of the input buffer in number of characters. 266 * @deprecated Use {@link ReaderInputStream#builder()} instead 267 */ 268 @Deprecated 269 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 270 // @formatter:off 271 this(reader, 272 Charsets.toCharset(charset).newEncoder() 273 .onMalformedInput(CodingErrorAction.REPLACE) 274 .onUnmappableCharacter(CodingErrorAction.REPLACE), 275 bufferSize); 276 // @formatter:on 277 } 278 279 /** 280 * Constructs a new {@link ReaderInputStream}. 281 * 282 * <p> 283 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 284 * an encoder which had already been in use. 285 * </p> 286 * 287 * @param reader the target {@link Reader} 288 * @param charsetEncoder the charset encoder 289 * @since 2.1 290 * @deprecated Use {@link ReaderInputStream#builder()} instead 291 */ 292 @Deprecated 293 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 294 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 295 } 296 297 /** 298 * Constructs a new {@link ReaderInputStream}. 299 * 300 * <p> 301 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 302 * an encoder which had already been in use. 303 * </p> 304 * 305 * @param reader the target {@link Reader} 306 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 307 * @param bufferSize the size of the input buffer in number of characters 308 * @since 2.1 309 * @deprecated Use {@link ReaderInputStream#builder()} instead 310 */ 311 @Deprecated 312 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 313 this.reader = reader; 314 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 315 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 316 this.encoderIn.flip(); 317 this.encoderOut = ByteBuffer.allocate(128); 318 this.encoderOut.flip(); 319 } 320 321 /** 322 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 323 * 324 * <p> 325 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 326 * </p> 327 * 328 * @param reader the target {@link Reader} 329 * @param charsetName the name of the charset encoding 330 * @deprecated Use {@link ReaderInputStream#builder()} instead 331 */ 332 @Deprecated 333 public ReaderInputStream(final Reader reader, final String charsetName) { 334 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 335 } 336 337 /** 338 * Constructs a new {@link ReaderInputStream}. 339 * 340 * <p> 341 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 342 * </p> 343 * 344 * @param reader the target {@link Reader} 345 * @param charsetName the name of the charset encoding, null maps to the default Charset. 346 * @param bufferSize the size of the input buffer in number of characters 347 * @deprecated Use {@link ReaderInputStream#builder()} instead 348 */ 349 @Deprecated 350 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 351 this(reader, Charsets.toCharset(charsetName), bufferSize); 352 } 353 354 @Override 355 public int available() throws IOException { 356 if (encoderOut.hasRemaining()) { 357 return encoderOut.remaining(); 358 } 359 return 0; 360 } 361 362 /** 363 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 364 * 365 * @throws IOException if an I/O error occurs. 366 */ 367 @Override 368 public void close() throws IOException { 369 reader.close(); 370 super.close(); 371 } 372 373 /** 374 * Fills the internal char buffer from the reader. 375 * 376 * @throws IOException If an I/O error occurs 377 */ 378 private void fillBuffer() throws IOException { 379 if (endOfInput) { 380 return; 381 } 382 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 383 encoderIn.compact(); 384 final int position = encoderIn.position(); 385 // We don't use Reader#read(CharBuffer) here because it is more efficient 386 // to write directly to the underlying char array (the default implementation 387 // copies data to a temporary char array). 388 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 389 if (c == EOF) { 390 endOfInput = true; 391 } else { 392 encoderIn.position(position + c); 393 } 394 encoderIn.flip(); 395 } 396 encoderOut.compact(); 397 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 398 if (endOfInput) { 399 lastCoderResult = charsetEncoder.flush(encoderOut); 400 } 401 if (lastCoderResult.isError()) { 402 lastCoderResult.throwException(); 403 } 404 encoderOut.flip(); 405 } 406 407 /** 408 * Gets the CharsetEncoder. 409 * 410 * @return the CharsetEncoder. 411 */ 412 CharsetEncoder getCharsetEncoder() { 413 return charsetEncoder; 414 } 415 416 /** 417 * Reads a single byte. 418 * 419 * @return either the byte read or {@code -1} if the end of the stream has been reached 420 * @throws IOException if an I/O error occurs. 421 */ 422 @Override 423 public int read() throws IOException { 424 checkOpen(); 425 for (;;) { 426 if (encoderOut.hasRemaining()) { 427 return encoderOut.get() & 0xFF; 428 } 429 fillBuffer(); 430 if (endOfInput && !encoderOut.hasRemaining()) { 431 return EOF; 432 } 433 } 434 } 435 436 /** 437 * Reads the specified number of bytes into an array. 438 * 439 * @param b the byte array to read into 440 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 441 * @throws IOException if an I/O error occurs. 442 */ 443 @Override 444 public int read(final byte[] b) throws IOException { 445 return read(b, 0, b.length); 446 } 447 448 /** 449 * Reads the specified number of bytes into an array. 450 * 451 * @param array the byte array to read into 452 * @param off the offset to start reading bytes into 453 * @param len the number of bytes to read 454 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 455 * @throws IOException if an I/O error occurs. 456 */ 457 @Override 458 public int read(final byte[] array, int off, int len) throws IOException { 459 Objects.requireNonNull(array, "array"); 460 if (len < 0 || off < 0 || off + len > array.length) { 461 throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); 462 } 463 int read = 0; 464 if (len == 0) { 465 return 0; // Always return 0 if len == 0 466 } 467 while (len > 0) { 468 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 469 final int c = Math.min(encoderOut.remaining(), len); 470 encoderOut.get(array, off, c); 471 off += c; 472 len -= c; 473 read += c; 474 } else if (endOfInput) { // Already reach EOF in the last read 475 break; 476 } else { // Read again 477 fillBuffer(); 478 } 479 } 480 return read == 0 && endOfInput ? EOF : read; 481 } 482}