1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.io.input; 18 19 import static org.apache.commons.io.IOUtils.EOF; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.Reader; 24 import java.nio.ByteBuffer; 25 import java.nio.CharBuffer; 26 import java.nio.charset.Charset; 27 import java.nio.charset.CharsetEncoder; 28 import java.nio.charset.CoderResult; 29 import java.nio.charset.CodingErrorAction; 30 import java.util.Objects; 31 32 import org.apache.commons.io.Charsets; 33 import org.apache.commons.io.IOUtils; 34 import org.apache.commons.io.build.AbstractStreamBuilder; 35 import org.apache.commons.io.charset.CharsetEncoders; 36 37 /** 38 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 39 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 40 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 41 * <p> 42 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 43 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 44 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 45 * {@link java.io.BufferedReader}. 46 * </p> 47 * <p> 48 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2} 49 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 50 * </p> 51 * <p> 52 * To build an instance, use {@link Builder}. 53 * </p> 54 * <pre> 55 * InputStream inputStream = ... 56 * Charset cs = ... 57 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 58 * ReaderInputStream in2 = ReaderInputStream.builder() 59 * .setReader(reader) 60 * .setCharset(cs) 61 * .get(); 62 * </pre> 63 * <p> 64 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes 65 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 66 * pulls it from the underlying stream. 67 * </p> 68 * <p> 69 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 70 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 71 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 72 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 73 * </p> 74 * <p> 75 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 76 * </p> 77 * <p> 78 * Instances of {@link ReaderInputStream} are not thread safe. 79 * </p> 80 * 81 * @see Builder 82 * @see org.apache.commons.io.output.WriterOutputStream 83 * @since 2.0 84 */ 85 public class ReaderInputStream extends InputStream { 86 87 // @formatter:off 88 /** 89 * Builds a new {@link ReaderInputStream}. 90 * 91 * <p> 92 * For example: 93 * </p> 94 * <pre>{@code 95 * ReaderInputStream s = ReaderInputStream.builder() 96 * .setPath(path) 97 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 98 * .get();} 99 * </pre> 100 * 101 * @see #get() 102 * @since 2.12.0 103 */ 104 // @formatter:on 105 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 106 107 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 108 109 /** 110 * Builds a new {@link ReaderInputStream}. 111 * 112 * <p> 113 * You must set input that supports {@link #getReader()}, otherwise, this method throws an exception. 114 * </p> 115 * <p> 116 * This builder use the following aspects: 117 * </p> 118 * <ul> 119 * <li>{@link #getReader()}</li> 120 * <li>{@link #getBufferSize()}</li> 121 * <li>{@link #getCharset()}</li> 122 * <li>{@link CharsetEncoder}</li> 123 * </ul> 124 * 125 * @return a new instance. 126 * @throws UnsupportedOperationException if the origin cannot provide a Reader. 127 * @throws IllegalStateException if the {@code origin} is {@code null}. 128 * @see #getReader() 129 * @see CharsetEncoder 130 * @see #getBufferSize() 131 */ 132 @SuppressWarnings("resource") 133 @Override 134 public ReaderInputStream get() throws IOException { 135 return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize()); 136 } 137 138 CharsetEncoder getCharsetEncoder() { 139 return charsetEncoder; 140 } 141 142 @Override 143 public Builder setCharset(final Charset charset) { 144 super.setCharset(charset); 145 charsetEncoder = newEncoder(getCharset()); 146 return this; 147 } 148 149 /** 150 * Sets the charset encoder. Assumes that the caller has configured the encoder. 151 * 152 * @param newEncoder the charset encoder, null resets to a default encoder. 153 * @return this 154 */ 155 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 156 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 157 super.setCharset(charsetEncoder.charset()); 158 return this; 159 } 160 161 } 162 163 /** 164 * Constructs a new {@link Builder}. 165 * 166 * @return a new {@link Builder}. 167 * @since 2.12.0 168 */ 169 public static Builder builder() { 170 return new Builder(); 171 } 172 173 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 174 final float minRequired = minBufferSize(charsetEncoder); 175 if (bufferSize < minRequired) { 176 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 177 charsetEncoder.charset().displayName())); 178 } 179 return bufferSize; 180 } 181 182 static float minBufferSize(final CharsetEncoder charsetEncoder) { 183 return charsetEncoder.maxBytesPerChar() * 2; 184 } 185 186 private static CharsetEncoder newEncoder(final Charset charset) { 187 // @formatter:off 188 return Charsets.toCharset(charset).newEncoder() 189 .onMalformedInput(CodingErrorAction.REPLACE) 190 .onUnmappableCharacter(CodingErrorAction.REPLACE); 191 // @formatter:on 192 } 193 194 private final Reader reader; 195 196 private final CharsetEncoder charsetEncoder; 197 198 /** 199 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 200 */ 201 private final CharBuffer encoderIn; 202 /** 203 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 204 * caller. 205 */ 206 private final ByteBuffer encoderOut; 207 208 private CoderResult lastCoderResult; 209 210 private boolean endOfInput; 211 212 /** 213 * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of 214 * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 215 * 216 * @param reader the target {@link Reader} 217 * @deprecated Use {@link ReaderInputStream#builder()} instead 218 */ 219 @Deprecated 220 public ReaderInputStream(final Reader reader) { 221 this(reader, Charset.defaultCharset()); 222 } 223 224 /** 225 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 226 * 227 * <p> 228 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 229 * </p> 230 * 231 * @param reader the target {@link Reader} 232 * @param charset the charset encoding 233 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 234 */ 235 @Deprecated 236 public ReaderInputStream(final Reader reader, final Charset charset) { 237 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 238 } 239 240 /** 241 * Constructs a new {@link ReaderInputStream}. 242 * 243 * <p> 244 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 245 * </p> 246 * 247 * @param reader the target {@link Reader}. 248 * @param charset the charset encoding. 249 * @param bufferSize the size of the input buffer in number of characters. 250 * @deprecated Use {@link ReaderInputStream#builder()} instead 251 */ 252 @Deprecated 253 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 254 // @formatter:off 255 this(reader, 256 Charsets.toCharset(charset).newEncoder() 257 .onMalformedInput(CodingErrorAction.REPLACE) 258 .onUnmappableCharacter(CodingErrorAction.REPLACE), 259 bufferSize); 260 // @formatter:on 261 } 262 263 /** 264 * Constructs a new {@link ReaderInputStream}. 265 * 266 * <p> 267 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 268 * an encoder which had already been in use. 269 * </p> 270 * 271 * @param reader the target {@link Reader} 272 * @param charsetEncoder the charset encoder 273 * @since 2.1 274 * @deprecated Use {@link ReaderInputStream#builder()} instead 275 */ 276 @Deprecated 277 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 278 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 279 } 280 281 /** 282 * Constructs a new {@link ReaderInputStream}. 283 * 284 * <p> 285 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 286 * an encoder which had already been in use. 287 * </p> 288 * 289 * @param reader the target {@link Reader} 290 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 291 * @param bufferSize the size of the input buffer in number of characters 292 * @since 2.1 293 * @deprecated Use {@link ReaderInputStream#builder()} instead 294 */ 295 @Deprecated 296 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 297 this.reader = reader; 298 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 299 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 300 this.encoderIn.flip(); 301 this.encoderOut = ByteBuffer.allocate(128); 302 this.encoderOut.flip(); 303 } 304 305 /** 306 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 307 * 308 * <p> 309 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 310 * </p> 311 * 312 * @param reader the target {@link Reader} 313 * @param charsetName the name of the charset encoding 314 * @deprecated Use {@link ReaderInputStream#builder()} instead 315 */ 316 @Deprecated 317 public ReaderInputStream(final Reader reader, final String charsetName) { 318 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 319 } 320 321 /** 322 * Constructs a new {@link ReaderInputStream}. 323 * 324 * <p> 325 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 326 * </p> 327 * 328 * @param reader the target {@link Reader} 329 * @param charsetName the name of the charset encoding, null maps to the default Charset. 330 * @param bufferSize the size of the input buffer in number of characters 331 * @deprecated Use {@link ReaderInputStream#builder()} instead 332 */ 333 @Deprecated 334 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 335 this(reader, Charsets.toCharset(charsetName), bufferSize); 336 } 337 338 /** 339 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 340 * 341 * @throws IOException if an I/O error occurs. 342 */ 343 @Override 344 public void close() throws IOException { 345 reader.close(); 346 } 347 348 /** 349 * Fills the internal char buffer from the reader. 350 * 351 * @throws IOException If an I/O error occurs 352 */ 353 private void fillBuffer() throws IOException { 354 if (endOfInput) { 355 return; 356 } 357 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 358 encoderIn.compact(); 359 final int position = encoderIn.position(); 360 // We don't use Reader#read(CharBuffer) here because it is more efficient 361 // to write directly to the underlying char array (the default implementation 362 // copies data to a temporary char array). 363 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 364 if (c == EOF) { 365 endOfInput = true; 366 } else { 367 encoderIn.position(position + c); 368 } 369 encoderIn.flip(); 370 } 371 encoderOut.compact(); 372 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 373 if (endOfInput) { 374 lastCoderResult = charsetEncoder.flush(encoderOut); 375 } 376 if (lastCoderResult.isError()) { 377 lastCoderResult.throwException(); 378 } 379 encoderOut.flip(); 380 } 381 382 /** 383 * Gets the CharsetEncoder. 384 * 385 * @return the CharsetEncoder. 386 */ 387 CharsetEncoder getCharsetEncoder() { 388 return charsetEncoder; 389 } 390 391 /** 392 * Reads a single byte. 393 * 394 * @return either the byte read or {@code -1} if the end of the stream has been reached 395 * @throws IOException if an I/O error occurs. 396 */ 397 @Override 398 public int read() throws IOException { 399 for (;;) { 400 if (encoderOut.hasRemaining()) { 401 return encoderOut.get() & 0xFF; 402 } 403 fillBuffer(); 404 if (endOfInput && !encoderOut.hasRemaining()) { 405 return EOF; 406 } 407 } 408 } 409 410 /** 411 * Reads the specified number of bytes into an array. 412 * 413 * @param b the byte array to read into 414 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 415 * @throws IOException if an I/O error occurs. 416 */ 417 @Override 418 public int read(final byte[] b) throws IOException { 419 return read(b, 0, b.length); 420 } 421 422 /** 423 * Reads the specified number of bytes into an array. 424 * 425 * @param array the byte array to read into 426 * @param off the offset to start reading bytes into 427 * @param len the number of bytes to read 428 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 429 * @throws IOException if an I/O error occurs. 430 */ 431 @Override 432 public int read(final byte[] array, int off, int len) throws IOException { 433 Objects.requireNonNull(array, "array"); 434 if (len < 0 || off < 0 || off + len > array.length) { 435 throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); 436 } 437 int read = 0; 438 if (len == 0) { 439 return 0; // Always return 0 if len == 0 440 } 441 while (len > 0) { 442 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 443 final int c = Math.min(encoderOut.remaining(), len); 444 encoderOut.get(array, off, c); 445 off += c; 446 len -= c; 447 read += c; 448 } else if (endOfInput) { // Already reach EOF in the last read 449 break; 450 } else { // Read again 451 fillBuffer(); 452 } 453 } 454 return read == 0 && endOfInput ? EOF : read; 455 } 456 }