CharSequenceInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.io.input;

  18. import static org.apache.commons.io.IOUtils.EOF;

  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.nio.ByteBuffer;
  22. import java.nio.CharBuffer;
  23. import java.nio.charset.CharacterCodingException;
  24. import java.nio.charset.Charset;
  25. import java.nio.charset.CharsetEncoder;
  26. import java.nio.charset.CoderResult;
  27. import java.nio.charset.CodingErrorAction;
  28. import java.util.Objects;

  29. import org.apache.commons.io.Charsets;
  30. import org.apache.commons.io.IOUtils;
  31. import org.apache.commons.io.build.AbstractStreamBuilder;
  32. import org.apache.commons.io.charset.CharsetEncoders;
  33. import org.apache.commons.io.function.Uncheck;

  34. /**
  35.  * Implements an {@link InputStream} to read bytes from String, StringBuffer, StringBuilder or CharBuffer,
  36.  * encoded using the specified Charset. The Charset defaults to Charset.defaultCharset().
  37.  * <p>
  38.  * <strong>Note:</strong> Supports {@link #mark(int)} and {@link #reset()}.
  39.  * </p>
  40.  * <p>
  41.  * To build an instance, use {@link Builder}.
  42.  * </p>
  43.  *
  44.  * @see Builder
  45.  * @since 2.2
  46.  */
  47. public class CharSequenceInputStream extends InputStream {

  48.     //@formatter:off
  49.     /**
  50.      * Builds a new {@link CharSequenceInputStream}.
  51.      *
  52.      * <p>
  53.      * For example:
  54.      * </p>
  55.      * <h2>Using a Charset</h2>
  56.      * <pre>{@code
  57.      * CharSequenceInputStream s = CharSequenceInputStream.builder()
  58.      *   .setBufferSize(8192)
  59.      *   .setCharSequence("String")
  60.      *   .setCharset(Charset.defaultCharset())
  61.      *   .get();}
  62.      * </pre>
  63.      * <h2>Using a CharsetEncoder</h2>
  64.      * <pre>{@code
  65.      * CharSequenceInputStream s = CharSequenceInputStream.builder()
  66.      *   .setBufferSize(8192)
  67.      *   .setCharSequence("String")
  68.      *   .setCharsetEncoder(Charset.defaultCharset().newEncoder()
  69.      *     .onMalformedInput(CodingErrorAction.REPLACE)
  70.      *     .onUnmappableCharacter(CodingErrorAction.REPLACE))
  71.      *   .get();}
  72.      * </pre>
  73.      *
  74.      * @see #get()
  75.      * @since 2.13.0
  76.      */
  77.     //@formatter:on
  78.     public static class Builder extends AbstractStreamBuilder<CharSequenceInputStream, Builder> {

  79.         private CharsetEncoder charsetEncoder = newEncoder(getCharset());

  80.         /**
  81.          * Constructs a new builder of {@link CharSequenceInputStream}.
  82.          */
  83.         public Builder() {
  84.             // empty
  85.         }

  86.         /**
  87.          * Builds a new {@link CharSequenceInputStream}.
  88.          * <p>
  89.          * You must set an aspect that supports {@link #getCharSequence()}, otherwise, this method throws an exception.
  90.          * </p>
  91.          * <p>
  92.          * This builder uses the following aspects:
  93.          * </p>
  94.          * <ul>
  95.          * <li>{@link #getCharSequence()} gets the target aspect.</li>
  96.          * <li>{@link #getBufferSize()}</li>
  97.          * <li>{@link CharsetEncoder}</li>
  98.          * </ul>
  99.          *
  100.          * @return a new instance.
  101.          * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  102.          * @see #getUnchecked()
  103.          */
  104.         @Override
  105.         public CharSequenceInputStream get() {
  106.             return Uncheck.get(() -> new CharSequenceInputStream(getCharSequence(), getBufferSize(), charsetEncoder));
  107.         }

  108.         CharsetEncoder getCharsetEncoder() {
  109.             return charsetEncoder;
  110.         }

  111.         @Override
  112.         public Builder setCharset(final Charset charset) {
  113.             super.setCharset(charset);
  114.             charsetEncoder = newEncoder(getCharset());
  115.             return this;
  116.         }

  117.         /**
  118.          * Sets the charset encoder. Assumes that the caller has configured the encoder.
  119.          *
  120.          * @param newEncoder the charset encoder.
  121.          * @return {@code this} instance.
  122.          * @since 2.13.0
  123.          */
  124.         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
  125.             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
  126.             super.setCharset(charsetEncoder.charset());
  127.             return this;
  128.         }

  129.     }

  130.     private static final int NO_MARK = -1;

  131.     /**
  132.      * Constructs a new {@link Builder}.
  133.      *
  134.      * @return a new {@link Builder}.
  135.      * @since 2.12.0
  136.      */
  137.     public static Builder builder() {
  138.         return new Builder();
  139.     }

  140.     private static CharsetEncoder newEncoder(final Charset charset) {
  141.         // @formatter:off
  142.         return Charsets.toCharset(charset).newEncoder()
  143.                 .onMalformedInput(CodingErrorAction.REPLACE)
  144.                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
  145.         // @formatter:on
  146.     }

  147.     private final ByteBuffer bBuf;
  148.     private int bBufMark; // position in bBuf
  149.     private final CharBuffer cBuf;
  150.     private int cBufMark; // position in cBuf
  151.     private final CharsetEncoder charsetEncoder;

  152.     /**
  153.      * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
  154.      *
  155.      * @param cs the input character sequence.
  156.      * @param charset the character set name to use.
  157.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  158.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  159.      */
  160.     @Deprecated
  161.     public CharSequenceInputStream(final CharSequence cs, final Charset charset) {
  162.         this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
  163.     }

  164.     /**
  165.      * Constructs a new instance.
  166.      *
  167.      * @param cs the input character sequence.
  168.      * @param charset the character set name to use, null maps to the default Charset.
  169.      * @param bufferSize the buffer size to use.
  170.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  171.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  172.      */
  173.     @Deprecated
  174.     public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) {
  175.         // @formatter:off
  176.         this(cs, bufferSize, newEncoder(charset));
  177.         // @formatter:on
  178.     }

  179.     private CharSequenceInputStream(final CharSequence cs, final int bufferSize, final CharsetEncoder charsetEncoder) {
  180.         this.charsetEncoder = charsetEncoder;
  181.         // Ensure that buffer is long enough to hold a complete character
  182.         this.bBuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(charsetEncoder, bufferSize));
  183.         this.bBuf.flip();
  184.         this.cBuf = CharBuffer.wrap(cs);
  185.         this.cBufMark = NO_MARK;
  186.         this.bBufMark = NO_MARK;
  187.         try {
  188.             fillBuffer();
  189.         } catch (final CharacterCodingException ex) {
  190.             // Reset everything without filling the buffer
  191.             // so the same exception can be thrown again later.
  192.             this.bBuf.clear();
  193.             this.bBuf.flip();
  194.             this.cBuf.rewind();
  195.         }
  196.     }

  197.     /**
  198.      * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
  199.      *
  200.      * @param cs the input character sequence.
  201.      * @param charset the character set name to use.
  202.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  203.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  204.      */
  205.     @Deprecated
  206.     public CharSequenceInputStream(final CharSequence cs, final String charset) {
  207.         this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
  208.     }

  209.     /**
  210.      * Constructs a new instance.
  211.      *
  212.      * @param cs the input character sequence.
  213.      * @param charset the character set name to use, null maps to the default Charset.
  214.      * @param bufferSize the buffer size to use.
  215.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  216.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  217.      */
  218.     @Deprecated
  219.     public CharSequenceInputStream(final CharSequence cs, final String charset, final int bufferSize) {
  220.         this(cs, Charsets.toCharset(charset), bufferSize);
  221.     }

  222.     /**
  223.      * Gets a lower bound on the number of bytes remaining in the byte stream.
  224.      *
  225.      * @return the count of bytes that can be read without blocking (or returning EOF).
  226.      * @throws IOException if an error occurs (probably not possible).
  227.      */
  228.     @Override
  229.     public int available() throws IOException {
  230.         return this.bBuf.remaining();
  231.     }

  232.     @Override
  233.     public void close() throws IOException {
  234.         bBuf.position(bBuf.limit());
  235.     }

  236.     /**
  237.      * Fills the byte output buffer from the input char buffer.
  238.      *
  239.      * @throws CharacterCodingException
  240.      *             an error encoding data.
  241.      */
  242.     private void fillBuffer() throws CharacterCodingException {
  243.         this.bBuf.compact();
  244.         final CoderResult result = this.charsetEncoder.encode(this.cBuf, this.bBuf, true);
  245.         if (result.isError()) {
  246.             result.throwException();
  247.         }
  248.         this.bBuf.flip();
  249.     }

  250.     /**
  251.      * Gets the CharsetEncoder.
  252.      *
  253.      * @return the CharsetEncoder.
  254.      */
  255.     CharsetEncoder getCharsetEncoder() {
  256.         return charsetEncoder;
  257.     }

  258.     /**
  259.      * {@inheritDoc}
  260.      * @param readLimit max read limit (ignored).
  261.      */
  262.     @Override
  263.     public synchronized void mark(final int readLimit) {
  264.         this.cBufMark = this.cBuf.position();
  265.         this.bBufMark = this.bBuf.position();
  266.         this.cBuf.mark();
  267.         this.bBuf.mark();
  268.         // It would be nice to be able to use mark & reset on the cBuf and bBuf;
  269.         // however the bBuf is re-used so that won't work
  270.     }

  271.     @Override
  272.     public boolean markSupported() {
  273.         return true;
  274.     }

  275.     @Override
  276.     public int read() throws IOException {
  277.         for (;;) {
  278.             if (this.bBuf.hasRemaining()) {
  279.                 return this.bBuf.get() & 0xFF;
  280.             }
  281.             fillBuffer();
  282.             if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  283.                 return EOF;
  284.             }
  285.         }
  286.     }

  287.     @Override
  288.     public int read(final byte[] b) throws IOException {
  289.         return read(b, 0, b.length);
  290.     }

  291.     @Override
  292.     public int read(final byte[] array, int off, int len) throws IOException {
  293.         Objects.requireNonNull(array, "array");
  294.         if (len < 0 || off + len > array.length) {
  295.             throw new IndexOutOfBoundsException("Array Size=" + array.length + ", offset=" + off + ", length=" + len);
  296.         }
  297.         if (len == 0) {
  298.             return 0; // must return 0 for zero length read
  299.         }
  300.         if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  301.             return EOF;
  302.         }
  303.         int bytesRead = 0;
  304.         while (len > 0) {
  305.             if (this.bBuf.hasRemaining()) {
  306.                 final int chunk = Math.min(this.bBuf.remaining(), len);
  307.                 this.bBuf.get(array, off, chunk);
  308.                 off += chunk;
  309.                 len -= chunk;
  310.                 bytesRead += chunk;
  311.             } else {
  312.                 fillBuffer();
  313.                 if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  314.                     break;
  315.                 }
  316.             }
  317.         }
  318.         return bytesRead == 0 && !this.cBuf.hasRemaining() ? EOF : bytesRead;
  319.     }

  320.     @Override
  321.     public synchronized void reset() throws IOException {
  322.         //
  323.         // This is not the most efficient implementation, as it re-encodes from the beginning.
  324.         //
  325.         // Since the bBuf is re-used, in general it's necessary to re-encode the data.
  326.         //
  327.         // It should be possible to apply some optimizations however:
  328.         // + use mark/reset on the cBuf and bBuf. This would only work if the buffer had not been (re)filled since
  329.         // the mark. The code would have to catch InvalidMarkException - does not seem possible to check if mark is
  330.         // valid otherwise. + Try saving the state of the cBuf before each fillBuffer; it might be possible to
  331.         // restart from there.
  332.         //
  333.         if (this.cBufMark != NO_MARK) {
  334.             // if cBuf is at 0, we have not started reading anything, so skip re-encoding
  335.             if (this.cBuf.position() != 0) {
  336.                 this.charsetEncoder.reset();
  337.                 this.cBuf.rewind();
  338.                 this.bBuf.rewind();
  339.                 this.bBuf.limit(0); // rewind does not clear the buffer
  340.                 while (this.cBuf.position() < this.cBufMark) {
  341.                     this.bBuf.rewind(); // empty the buffer (we only refill when empty during normal processing)
  342.                     this.bBuf.limit(0);
  343.                     fillBuffer();
  344.                 }
  345.             }
  346.             if (this.cBuf.position() != this.cBufMark) {
  347.                 throw new IllegalStateException("Unexpected CharBuffer position: actual=" + cBuf.position() + " " +
  348.                         "expected=" + this.cBufMark);
  349.             }
  350.             this.bBuf.position(this.bBufMark);
  351.             this.cBufMark = NO_MARK;
  352.             this.bBufMark = NO_MARK;
  353.         }
  354.         mark(0);
  355.     }

  356.     @Override
  357.     public long skip(long n) throws IOException {
  358.         //
  359.         // This could be made more efficient by using position to skip within the current buffer.
  360.         //
  361.         long skipped = 0;
  362.         while (n > 0 && available() > 0) {
  363.             this.read();
  364.             n--;
  365.             skipped++;
  366.         }
  367.         return skipped;
  368.     }

  369. }