1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.io.input; 18 19 import static org.apache.commons.io.IOUtils.EOF; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.Reader; 24 import java.nio.ByteBuffer; 25 import java.nio.CharBuffer; 26 import java.nio.charset.Charset; 27 import java.nio.charset.CharsetEncoder; 28 import java.nio.charset.CoderResult; 29 import java.nio.charset.CodingErrorAction; 30 import java.util.Objects; 31 32 import org.apache.commons.io.Charsets; 33 import org.apache.commons.io.IOUtils; 34 import org.apache.commons.io.build.AbstractOrigin; 35 import org.apache.commons.io.build.AbstractStreamBuilder; 36 import org.apache.commons.io.charset.CharsetEncoders; 37 38 /** 39 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 40 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 41 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 42 * <p> 43 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 44 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 45 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 46 * {@link java.io.BufferedReader}. 47 * </p> 48 * <p> 49 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2} 50 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 51 * </p> 52 * <p> 53 * To build an instance, see {@link Builder}. 54 * </p> 55 * <pre> 56 * InputStream inputStream = ... 57 * Charset cs = ... 58 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 59 * ReaderInputStream in2 = ReaderInputStream.builder() 60 * .setReader(reader) 61 * .setCharset(cs) 62 * .get(); 63 * </pre> 64 * <p> 65 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes 66 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 67 * pulls it from the underlying stream. 68 * </p> 69 * <p> 70 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 71 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 72 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 73 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 74 * </p> 75 * <p> 76 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 77 * </p> 78 * <p> 79 * Instances of {@link ReaderInputStream} are not thread safe. 80 * </p> 81 * 82 * @see org.apache.commons.io.output.WriterOutputStream 83 * @since 2.0 84 */ 85 public class ReaderInputStream extends InputStream { 86 87 /** 88 * Builds a new {@link ReaderInputStream} instance. 89 * <p> 90 * For example: 91 * </p> 92 * <pre>{@code 93 * ReaderInputStream s = ReaderInputStream.builder() 94 * .setPath(path) 95 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 96 * .get();} 97 * </pre> 98 * 99 * @since 2.12.0 100 */ 101 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 102 103 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 104 105 /** 106 * Constructs a new instance. 107 * <p> 108 * This builder use the aspects Reader, Charset, CharsetEncoder, buffer size. 109 * </p> 110 * <p> 111 * You must provide an origin that can be converted to a Reader by this builder, otherwise, this call will throw an 112 * {@link UnsupportedOperationException}. 113 * </p> 114 * 115 * @return a new instance. 116 * @throws UnsupportedOperationException if the origin cannot provide a Reader. 117 * @throws IllegalStateException if the {@code origin} is {@code null}. 118 * @see AbstractOrigin#getReader(Charset) 119 */ 120 @SuppressWarnings("resource") 121 @Override 122 public ReaderInputStream get() throws IOException { 123 return new ReaderInputStream(checkOrigin().getReader(getCharset()), charsetEncoder, getBufferSize()); 124 } 125 126 CharsetEncoder getCharsetEncoder() { 127 return charsetEncoder; 128 } 129 130 @Override 131 public Builder setCharset(final Charset charset) { 132 super.setCharset(charset); 133 charsetEncoder = newEncoder(getCharset()); 134 return this; 135 } 136 137 /** 138 * Sets the charset encoder. Assumes that the caller has configured the encoder. 139 * 140 * @param newEncoder the charset encoder, null resets to a default encoder. 141 * @return this 142 */ 143 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 144 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 145 super.setCharset(charsetEncoder.charset()); 146 return this; 147 } 148 149 } 150 151 /** 152 * Constructs a new {@link Builder}. 153 * 154 * @return a new {@link Builder}. 155 * @since 2.12.0 156 */ 157 public static Builder builder() { 158 return new Builder(); 159 } 160 161 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 162 final float minRequired = minBufferSize(charsetEncoder); 163 if (bufferSize < minRequired) { 164 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 165 charsetEncoder.charset().displayName())); 166 } 167 return bufferSize; 168 } 169 170 static float minBufferSize(final CharsetEncoder charsetEncoder) { 171 return charsetEncoder.maxBytesPerChar() * 2; 172 } 173 174 private static CharsetEncoder newEncoder(final Charset charset) { 175 // @formatter:off 176 return Charsets.toCharset(charset).newEncoder() 177 .onMalformedInput(CodingErrorAction.REPLACE) 178 .onUnmappableCharacter(CodingErrorAction.REPLACE); 179 // @formatter:on 180 } 181 182 private final Reader reader; 183 184 private final CharsetEncoder charsetEncoder; 185 186 /** 187 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 188 */ 189 private final CharBuffer encoderIn; 190 /** 191 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 192 * caller. 193 */ 194 private final ByteBuffer encoderOut; 195 196 private CoderResult lastCoderResult; 197 198 private boolean endOfInput; 199 200 /** 201 * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of 202 * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 203 * 204 * @param reader the target {@link Reader} 205 * @deprecated Use {@link ReaderInputStream#builder()} instead 206 */ 207 @Deprecated 208 public ReaderInputStream(final Reader reader) { 209 this(reader, Charset.defaultCharset()); 210 } 211 212 /** 213 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 214 * 215 * <p> 216 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 217 * </p> 218 * 219 * @param reader the target {@link Reader} 220 * @param charset the charset encoding 221 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 222 */ 223 @Deprecated 224 public ReaderInputStream(final Reader reader, final Charset charset) { 225 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 226 } 227 228 /** 229 * Constructs a new {@link ReaderInputStream}. 230 * 231 * <p> 232 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 233 * </p> 234 * 235 * @param reader the target {@link Reader}. 236 * @param charset the charset encoding. 237 * @param bufferSize the size of the input buffer in number of characters. 238 * @deprecated Use {@link ReaderInputStream#builder()} instead 239 */ 240 @Deprecated 241 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 242 // @formatter:off 243 this(reader, 244 Charsets.toCharset(charset).newEncoder() 245 .onMalformedInput(CodingErrorAction.REPLACE) 246 .onUnmappableCharacter(CodingErrorAction.REPLACE), 247 bufferSize); 248 // @formatter:on 249 } 250 251 /** 252 * Constructs a new {@link ReaderInputStream}. 253 * 254 * <p> 255 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 256 * an encoder which had already been in use. 257 * </p> 258 * 259 * @param reader the target {@link Reader} 260 * @param charsetEncoder the charset encoder 261 * @since 2.1 262 * @deprecated Use {@link ReaderInputStream#builder()} instead 263 */ 264 @Deprecated 265 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 266 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 267 } 268 269 /** 270 * Constructs a new {@link ReaderInputStream}. 271 * 272 * <p> 273 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 274 * an encoder which had already been in use. 275 * </p> 276 * 277 * @param reader the target {@link Reader} 278 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 279 * @param bufferSize the size of the input buffer in number of characters 280 * @since 2.1 281 * @deprecated Use {@link ReaderInputStream#builder()} instead 282 */ 283 @Deprecated 284 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 285 this.reader = reader; 286 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 287 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 288 this.encoderIn.flip(); 289 this.encoderOut = ByteBuffer.allocate(128); 290 this.encoderOut.flip(); 291 } 292 293 /** 294 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 295 * 296 * <p> 297 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 298 * </p> 299 * 300 * @param reader the target {@link Reader} 301 * @param charsetName the name of the charset encoding 302 * @deprecated Use {@link ReaderInputStream#builder()} instead 303 */ 304 @Deprecated 305 public ReaderInputStream(final Reader reader, final String charsetName) { 306 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 307 } 308 309 /** 310 * Constructs a new {@link ReaderInputStream}. 311 * 312 * <p> 313 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 314 * </p> 315 * 316 * @param reader the target {@link Reader} 317 * @param charsetName the name of the charset encoding, null maps to the default Charset. 318 * @param bufferSize the size of the input buffer in number of characters 319 * @deprecated Use {@link ReaderInputStream#builder()} instead 320 */ 321 @Deprecated 322 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 323 this(reader, Charsets.toCharset(charsetName), bufferSize); 324 } 325 326 /** 327 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 328 * 329 * @throws IOException if an I/O error occurs. 330 */ 331 @Override 332 public void close() throws IOException { 333 reader.close(); 334 } 335 336 /** 337 * Fills the internal char buffer from the reader. 338 * 339 * @throws IOException If an I/O error occurs 340 */ 341 private void fillBuffer() throws IOException { 342 if (endOfInput) { 343 return; 344 } 345 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 346 encoderIn.compact(); 347 final int position = encoderIn.position(); 348 // We don't use Reader#read(CharBuffer) here because it is more efficient 349 // to write directly to the underlying char array (the default implementation 350 // copies data to a temporary char array). 351 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 352 if (c == EOF) { 353 endOfInput = true; 354 } else { 355 encoderIn.position(position + c); 356 } 357 encoderIn.flip(); 358 } 359 encoderOut.compact(); 360 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 361 if (endOfInput) { 362 lastCoderResult = charsetEncoder.flush(encoderOut); 363 } 364 if (lastCoderResult.isError()) { 365 lastCoderResult.throwException(); 366 } 367 encoderOut.flip(); 368 } 369 370 /** 371 * Gets the CharsetEncoder. 372 * 373 * @return the CharsetEncoder. 374 */ 375 CharsetEncoder getCharsetEncoder() { 376 return charsetEncoder; 377 } 378 379 /** 380 * Reads a single byte. 381 * 382 * @return either the byte read or {@code -1} if the end of the stream has been reached 383 * @throws IOException if an I/O error occurs. 384 */ 385 @Override 386 public int read() throws IOException { 387 for (;;) { 388 if (encoderOut.hasRemaining()) { 389 return encoderOut.get() & 0xFF; 390 } 391 fillBuffer(); 392 if (endOfInput && !encoderOut.hasRemaining()) { 393 return EOF; 394 } 395 } 396 } 397 398 /** 399 * Reads the specified number of bytes into an array. 400 * 401 * @param b the byte array to read into 402 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 403 * @throws IOException if an I/O error occurs. 404 */ 405 @Override 406 public int read(final byte[] b) throws IOException { 407 return read(b, 0, b.length); 408 } 409 410 /** 411 * Reads the specified number of bytes into an array. 412 * 413 * @param array the byte array to read into 414 * @param off the offset to start reading bytes into 415 * @param len the number of bytes to read 416 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 417 * @throws IOException if an I/O error occurs. 418 */ 419 @Override 420 public int read(final byte[] array, int off, int len) throws IOException { 421 Objects.requireNonNull(array, "array"); 422 if (len < 0 || off < 0 || off + len > array.length) { 423 throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); 424 } 425 int read = 0; 426 if (len == 0) { 427 return 0; // Always return 0 if len == 0 428 } 429 while (len > 0) { 430 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 431 final int c = Math.min(encoderOut.remaining(), len); 432 encoderOut.get(array, off, c); 433 off += c; 434 len -= c; 435 read += c; 436 } else if (endOfInput) { // Already reach EOF in the last read 437 break; 438 } else { // Read again 439 fillBuffer(); 440 } 441 } 442 return read == 0 && endOfInput ? EOF : read; 443 } 444 }