CharSequenceInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.io.input;

  18. import static org.apache.commons.io.IOUtils.EOF;

  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.nio.ByteBuffer;
  22. import java.nio.CharBuffer;
  23. import java.nio.charset.CharacterCodingException;
  24. import java.nio.charset.Charset;
  25. import java.nio.charset.CharsetEncoder;
  26. import java.nio.charset.CoderResult;
  27. import java.nio.charset.CodingErrorAction;
  28. import java.util.Objects;

  29. import org.apache.commons.io.Charsets;
  30. import org.apache.commons.io.IOUtils;
  31. import org.apache.commons.io.build.AbstractStreamBuilder;
  32. import org.apache.commons.io.charset.CharsetEncoders;
  33. import org.apache.commons.io.function.Uncheck;

  34. /**
  35.  * Implements an {@link InputStream} to read bytes from String, StringBuffer, StringBuilder or CharBuffer,
  36.  * encoded using the specified Charset. The Charset defaults to Charset.defaultCharset().
  37.  * <p>
  38.  * <strong>Note:</strong> Supports {@link #mark(int)} and {@link #reset()}.
  39.  * </p>
  40.  * <p>
  41.  * To build an instance, use {@link Builder}.
  42.  * </p>
  43.  *
  44.  * @see Builder
  45.  * @since 2.2
  46.  */
  47. public class CharSequenceInputStream extends InputStream {

  48.     //@formatter:off
  49.     /**
  50.      * Builds a new {@link CharSequenceInputStream}.
  51.      *
  52.      * <p>
  53.      * For example:
  54.      * </p>
  55.      * <h2>Using a Charset</h2>
  56.      * <pre>{@code
  57.      * CharSequenceInputStream s = CharSequenceInputStream.builder()
  58.      *   .setBufferSize(8192)
  59.      *   .setCharSequence("String")
  60.      *   .setCharset(Charset.defaultCharset())
  61.      *   .get();}
  62.      * </pre>
  63.      * <h2>Using a CharsetEncoder</h2>
  64.      * <pre>{@code
  65.      * CharSequenceInputStream s = CharSequenceInputStream.builder()
  66.      *   .setBufferSize(8192)
  67.      *   .setCharSequence("String")
  68.      *   .setCharsetEncoder(Charset.defaultCharset().newEncoder()
  69.      *     .onMalformedInput(CodingErrorAction.REPLACE)
  70.      *     .onUnmappableCharacter(CodingErrorAction.REPLACE))
  71.      *   .get();}
  72.      * </pre>
  73.      *
  74.      * @see #get()
  75.      * @since 2.13.0
  76.      */
  77.     //@formatter:on
  78.     public static class Builder extends AbstractStreamBuilder<CharSequenceInputStream, Builder> {

  79.         private CharsetEncoder charsetEncoder = newEncoder(getCharset());

  80.         /**
  81.          * Constructs a new builder of {@link CharSequenceInputStream}.
  82.          */
  83.         public Builder() {
  84.             // empty
  85.         }

  86.         /**
  87.          * Builds a new {@link CharSequenceInputStream}.
  88.          * <p>
  89.          * You must set an aspect that supports {@link #getCharSequence()}, otherwise, this method throws an exception.
  90.          * </p>
  91.          * <p>
  92.          * This builder uses the following aspects:
  93.          * </p>
  94.          * <ul>
  95.          * <li>{@link #getCharSequence()} gets the target aspect.</li>
  96.          * <li>{@link #getBufferSize()}</li>
  97.          * <li>{@link CharsetEncoder}</li>
  98.          * </ul>
  99.          *
  100.          * @return a new instance.
  101.          * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  102.          * @see #getUnchecked()
  103.          */
  104.         @Override
  105.         public CharSequenceInputStream get() {
  106.             return Uncheck.get(() -> new CharSequenceInputStream(this));
  107.         }

  108.         CharsetEncoder getCharsetEncoder() {
  109.             return charsetEncoder;
  110.         }

  111.         @Override
  112.         public Builder setCharset(final Charset charset) {
  113.             super.setCharset(charset);
  114.             charsetEncoder = newEncoder(getCharset());
  115.             return this;
  116.         }

  117.         /**
  118.          * Sets the charset encoder. Assumes that the caller has configured the encoder.
  119.          *
  120.          * @param newEncoder the charset encoder.
  121.          * @return {@code this} instance.
  122.          * @since 2.13.0
  123.          */
  124.         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
  125.             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
  126.             super.setCharset(charsetEncoder.charset());
  127.             return this;
  128.         }

  129.     }

  130.     private static final int NO_MARK = -1;

  131.     /**
  132.      * Constructs a new {@link Builder}.
  133.      *
  134.      * @return a new {@link Builder}.
  135.      * @since 2.12.0
  136.      */
  137.     public static Builder builder() {
  138.         return new Builder();
  139.     }

  140.     private static CharsetEncoder newEncoder(final Charset charset) {
  141.         // @formatter:off
  142.         return Charsets.toCharset(charset).newEncoder()
  143.                 .onMalformedInput(CodingErrorAction.REPLACE)
  144.                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
  145.         // @formatter:on
  146.     }

  147.     private final ByteBuffer bBuf;
  148.     private int bBufMark; // position in bBuf
  149.     private final CharBuffer cBuf;
  150.     private int cBufMark; // position in cBuf
  151.     private final CharsetEncoder charsetEncoder;

  152.     private CharSequenceInputStream(final Builder builder) {
  153.         this.charsetEncoder = builder.charsetEncoder;
  154.         // Ensure that buffer is long enough to hold a complete character
  155.         this.bBuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(builder.charsetEncoder, builder.getBufferSize()));
  156.         this.bBuf.flip();
  157.         this.cBuf = CharBuffer.wrap(Uncheck.get(() -> builder.getCharSequence()));
  158.         this.cBufMark = NO_MARK;
  159.         this.bBufMark = NO_MARK;
  160.         try {
  161.             fillBuffer();
  162.         } catch (final CharacterCodingException ex) {
  163.             // Reset everything without filling the buffer
  164.             // so the same exception can be thrown again later.
  165.             this.bBuf.clear();
  166.             this.bBuf.flip();
  167.             this.cBuf.rewind();
  168.         }
  169.     }

  170.     /**
  171.      * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
  172.      *
  173.      * @param cs the input character sequence.
  174.      * @param charset the character set name to use.
  175.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  176.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  177.      */
  178.     @Deprecated
  179.     public CharSequenceInputStream(final CharSequence cs, final Charset charset) {
  180.         this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
  181.     }

  182.     /**
  183.      * Constructs a new instance.
  184.      *
  185.      * @param cs the input character sequence.
  186.      * @param charset the character set name to use, null maps to the default Charset.
  187.      * @param bufferSize the buffer size to use.
  188.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  189.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  190.      */
  191.     @Deprecated
  192.     public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) {
  193.         this(builder().setCharSequence(cs).setCharset(charset).setBufferSize(bufferSize));
  194.     }

  195.     /**
  196.      * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
  197.      *
  198.      * @param cs the input character sequence.
  199.      * @param charset the character set name to use.
  200.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  201.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  202.      */
  203.     @Deprecated
  204.     public CharSequenceInputStream(final CharSequence cs, final String charset) {
  205.         this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
  206.     }

  207.     /**
  208.      * Constructs a new instance.
  209.      *
  210.      * @param cs the input character sequence.
  211.      * @param charset the character set name to use, null maps to the default Charset.
  212.      * @param bufferSize the buffer size to use.
  213.      * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
  214.      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
  215.      */
  216.     @Deprecated
  217.     public CharSequenceInputStream(final CharSequence cs, final String charset, final int bufferSize) {
  218.         this(cs, Charsets.toCharset(charset), bufferSize);
  219.     }

  220.     /**
  221.      * Gets a lower bound on the number of bytes remaining in the byte stream.
  222.      *
  223.      * @return the count of bytes that can be read without blocking (or returning EOF).
  224.      * @throws IOException if an error occurs (probably not possible).
  225.      */
  226.     @Override
  227.     public int available() throws IOException {
  228.         return this.bBuf.remaining();
  229.     }

  230.     @Override
  231.     public void close() throws IOException {
  232.         bBuf.position(bBuf.limit());
  233.     }

  234.     /**
  235.      * Fills the byte output buffer from the input char buffer.
  236.      *
  237.      * @throws CharacterCodingException
  238.      *             an error encoding data.
  239.      */
  240.     private void fillBuffer() throws CharacterCodingException {
  241.         this.bBuf.compact();
  242.         final CoderResult result = this.charsetEncoder.encode(this.cBuf, this.bBuf, true);
  243.         if (result.isError()) {
  244.             result.throwException();
  245.         }
  246.         this.bBuf.flip();
  247.     }

  248.     /**
  249.      * Gets the CharsetEncoder.
  250.      *
  251.      * @return the CharsetEncoder.
  252.      */
  253.     CharsetEncoder getCharsetEncoder() {
  254.         return charsetEncoder;
  255.     }

  256.     /**
  257.      * {@inheritDoc}
  258.      * @param readLimit max read limit (ignored).
  259.      */
  260.     @Override
  261.     public synchronized void mark(final int readLimit) {
  262.         this.cBufMark = this.cBuf.position();
  263.         this.bBufMark = this.bBuf.position();
  264.         this.cBuf.mark();
  265.         this.bBuf.mark();
  266.         // It would be nice to be able to use mark & reset on the cBuf and bBuf;
  267.         // however the bBuf is re-used so that won't work
  268.     }

  269.     @Override
  270.     public boolean markSupported() {
  271.         return true;
  272.     }

  273.     @Override
  274.     public int read() throws IOException {
  275.         for (;;) {
  276.             if (this.bBuf.hasRemaining()) {
  277.                 return this.bBuf.get() & 0xFF;
  278.             }
  279.             fillBuffer();
  280.             if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  281.                 return EOF;
  282.             }
  283.         }
  284.     }

  285.     @Override
  286.     public int read(final byte[] b) throws IOException {
  287.         return read(b, 0, b.length);
  288.     }

  289.     @Override
  290.     public int read(final byte[] array, int off, int len) throws IOException {
  291.         Objects.requireNonNull(array, "array");
  292.         if (len < 0 || off + len > array.length) {
  293.             throw new IndexOutOfBoundsException("Array Size=" + array.length + ", offset=" + off + ", length=" + len);
  294.         }
  295.         if (len == 0) {
  296.             return 0; // must return 0 for zero length read
  297.         }
  298.         if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  299.             return EOF;
  300.         }
  301.         int bytesRead = 0;
  302.         while (len > 0) {
  303.             if (this.bBuf.hasRemaining()) {
  304.                 final int chunk = Math.min(this.bBuf.remaining(), len);
  305.                 this.bBuf.get(array, off, chunk);
  306.                 off += chunk;
  307.                 len -= chunk;
  308.                 bytesRead += chunk;
  309.             } else {
  310.                 fillBuffer();
  311.                 if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
  312.                     break;
  313.                 }
  314.             }
  315.         }
  316.         return bytesRead == 0 && !this.cBuf.hasRemaining() ? EOF : bytesRead;
  317.     }

  318.     @Override
  319.     public synchronized void reset() throws IOException {
  320.         //
  321.         // This is not the most efficient implementation, as it re-encodes from the beginning.
  322.         //
  323.         // Since the bBuf is re-used, in general it's necessary to re-encode the data.
  324.         //
  325.         // It should be possible to apply some optimizations however:
  326.         // + use mark/reset on the cBuf and bBuf. This would only work if the buffer had not been (re)filled since
  327.         // the mark. The code would have to catch InvalidMarkException - does not seem possible to check if mark is
  328.         // valid otherwise. + Try saving the state of the cBuf before each fillBuffer; it might be possible to
  329.         // restart from there.
  330.         //
  331.         if (this.cBufMark != NO_MARK) {
  332.             // if cBuf is at 0, we have not started reading anything, so skip re-encoding
  333.             if (this.cBuf.position() != 0) {
  334.                 this.charsetEncoder.reset();
  335.                 this.cBuf.rewind();
  336.                 this.bBuf.rewind();
  337.                 this.bBuf.limit(0); // rewind does not clear the buffer
  338.                 while (this.cBuf.position() < this.cBufMark) {
  339.                     this.bBuf.rewind(); // empty the buffer (we only refill when empty during normal processing)
  340.                     this.bBuf.limit(0);
  341.                     fillBuffer();
  342.                 }
  343.             }
  344.             if (this.cBuf.position() != this.cBufMark) {
  345.                 throw new IllegalStateException("Unexpected CharBuffer position: actual=" + cBuf.position() + " " +
  346.                         "expected=" + this.cBufMark);
  347.             }
  348.             this.bBuf.position(this.bBufMark);
  349.             this.cBufMark = NO_MARK;
  350.             this.bBufMark = NO_MARK;
  351.         }
  352.         mark(0);
  353.     }

  354.     @Override
  355.     public long skip(long n) throws IOException {
  356.         //
  357.         // This could be made more efficient by using position to skip within the current buffer.
  358.         //
  359.         long skipped = 0;
  360.         while (n > 0 && available() > 0) {
  361.             this.read();
  362.             n--;
  363.             skipped++;
  364.         }
  365.         return skipped;
  366.     }

  367. }