ExtendedBufferedReader.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  *   https://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */

  19. package org.apache.commons.csv;

  20. import static org.apache.commons.csv.Constants.CR;
  21. import static org.apache.commons.csv.Constants.LF;
  22. import static org.apache.commons.csv.Constants.UNDEFINED;
  23. import static org.apache.commons.io.IOUtils.EOF;

  24. import java.io.IOException;
  25. import java.io.Reader;
  26. import java.nio.CharBuffer;
  27. import java.nio.charset.CharacterCodingException;
  28. import java.nio.charset.Charset;
  29. import java.nio.charset.CharsetEncoder;

  30. import org.apache.commons.io.IOUtils;
  31. import org.apache.commons.io.input.UnsynchronizedBufferedReader;

  32. /**
  33.  * A special buffered reader which supports sophisticated read access.
  34.  * <p>
  35.  * In particular the reader supports a look-ahead option, which allows you to see the next char returned by
  36.  * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
  37.  * </p>
  38.  */
  39. final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {

  40.     /** The last char returned */
  41.     private int lastChar = UNDEFINED;
  42.     private int lastCharMark = UNDEFINED;

  43.     /** The count of EOLs (CR/LF/CRLF) seen so far */
  44.     private long lineNumber;
  45.     private long lineNumberMark;

  46.     /** The position, which is the number of characters read so far */
  47.     private long position;
  48.     private long positionMark;

  49.     /** The number of bytes read so far. */
  50.     private long bytesRead;
  51.     private long bytesReadMark;

  52.     /** Encoder for calculating the number of bytes for each character read. */
  53.     private final CharsetEncoder encoder;

  54.     /**
  55.      * Constructs a new instance using the default buffer size.
  56.      */
  57.     ExtendedBufferedReader(final Reader reader) {
  58.         this(reader, null, false);
  59.     }

  60.     /**
  61.      * Constructs a new instance with the specified reader, character set,
  62.      * and byte tracking option. Initializes an encoder if byte tracking is enabled
  63.      * and a character set is provided.
  64.      *
  65.      * @param reader the reader supports a look-ahead option.
  66.      * @param charset the character set for encoding, or {@code null} if not applicable.
  67.      * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
  68.      */
  69.     ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
  70.         super(reader);
  71.         encoder = charset != null && trackBytes ? charset.newEncoder() : null;
  72.     }

  73.     /**
  74.      * Closes the stream.
  75.      *
  76.      * @throws IOException
  77.      *             If an I/O error occurs
  78.      */
  79.     @Override
  80.     public void close() throws IOException {
  81.         // Set ivars before calling super close() in case close() throws an IOException.
  82.         lastChar = EOF;
  83.         super.close();
  84.     }

  85.     /**
  86.      * Gets the number of bytes read by the reader.
  87.      *
  88.      * @return the number of bytes read by the read
  89.      */
  90.     long getBytesRead() {
  91.         return this.bytesRead;
  92.     }

  93.     /**
  94.      * Gets the byte length of the given character based on the the original Unicode
  95.      * specification, which defined characters as fixed-width 16-bit entities.
  96.      * <p>
  97.      * The Unicode characters are divided into two main ranges:
  98.      * <ul>
  99.      *   <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
  100.      *     <ul>
  101.      *       <li>Represented using a single 16-bit {@code char}.</li>
  102.      *       <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
  103.      *     </ul>
  104.      *   </li>
  105.      *   <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
  106.      *     <ul>
  107.      *       <li>Represented as a pair of {@code char}s:</li>
  108.      *       <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
  109.      *       <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
  110.      *       <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
  111.      *     </ul>
  112.      *   </li>
  113.      * </ul>
  114.      *
  115.      * @param current the current character to process.
  116.      * @return the byte length of the character.
  117.      * @throws CharacterCodingException if the character cannot be encoded.
  118.      */
  119.     private int getEncodedCharLength(final int current) throws CharacterCodingException {
  120.         final char cChar = (char) current;
  121.         final char lChar = (char) lastChar;
  122.         if (!Character.isSurrogate(cChar)) {
  123.             return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
  124.         }
  125.         if (Character.isHighSurrogate(cChar)) {
  126.             // Move on to the next char (low surrogate)
  127.             return 0;
  128.         } else if (Character.isSurrogatePair(lChar, cChar)) {
  129.             return encoder.encode(CharBuffer.wrap(new char[] { lChar, cChar })).limit();
  130.         } else {
  131.             throw new CharacterCodingException();
  132.         }
  133.     }

  134.     /**
  135.      * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
  136.      * any of the read methods. This will not include a character read using the {@link #peek()} method. If no
  137.      * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
  138.      * on the last read then this will return {@link IOUtils#EOF}.
  139.      *
  140.      * @return the last character that was read
  141.      */
  142.     int getLastChar() {
  143.         return lastChar;
  144.     }

  145.     /**
  146.      * Returns the current line number
  147.      *
  148.      * @return the current line number
  149.      */
  150.     long getLineNumber() {
  151.         // Check if we are at EOL or EOF or just starting
  152.         if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) {
  153.             return lineNumber; // counter is accurate
  154.         }
  155.         return lineNumber + 1; // Allow for counter being incremented only at EOL
  156.     }

  157.     /**
  158.      * Gets the character position in the reader.
  159.      *
  160.      * @return the current position in the reader (counting characters, not bytes since this is a Reader)
  161.      */
  162.     long getPosition() {
  163.         return this.position;
  164.     }

  165.     @Override
  166.     public void mark(final int readAheadLimit) throws IOException {
  167.         lineNumberMark = lineNumber;
  168.         lastCharMark = lastChar;
  169.         positionMark = position;
  170.         bytesReadMark = bytesRead;
  171.         super.mark(readAheadLimit);
  172.     }

  173.     @Override
  174.     public int read() throws IOException {
  175.         final int current = super.read();
  176.         if (current == CR || current == LF && lastChar != CR ||
  177.             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
  178.             lineNumber++;
  179.         }
  180.         if (encoder != null) {
  181.             this.bytesRead += getEncodedCharLength(current);
  182.         }
  183.         lastChar = current;
  184.         position++;
  185.         return lastChar;
  186.     }

  187.     @Override
  188.     public int read(final char[] buf, final int offset, final int length) throws IOException {
  189.         if (length == 0) {
  190.             return 0;
  191.         }
  192.         final int len = super.read(buf, offset, length);
  193.         if (len > 0) {
  194.             for (int i = offset; i < offset + len; i++) {
  195.                 final char ch = buf[i];
  196.                 if (ch == LF) {
  197.                     if (CR != (i > offset ? buf[i - 1] : lastChar)) {
  198.                         lineNumber++;
  199.                     }
  200.                 } else if (ch == CR) {
  201.                     lineNumber++;
  202.                 }
  203.             }
  204.             lastChar = buf[offset + len - 1];
  205.         } else if (len == EOF) {
  206.             lastChar = EOF;
  207.         }
  208.         position += len;
  209.         return len;
  210.     }

  211.     /**
  212.      * Gets the next line, dropping the line terminator(s). This method should only be called when processing a
  213.      * comment, otherwise, information can be lost.
  214.      * <p>
  215.      * Increments {@link #lineNumber} and updates {@link #position}.
  216.      * </p>
  217.      * <p>
  218.      * Sets {@link #lastChar} to {@code Constants.EOF} at EOF, otherwise the last EOL character.
  219.      * </p>
  220.      *
  221.      * @return the line that was read, or null if reached EOF.
  222.      */
  223.     @Override
  224.     public String readLine() throws IOException {
  225.         if (peek() == EOF) {
  226.             return null;
  227.         }
  228.         final StringBuilder buffer = new StringBuilder();
  229.         while (true) {
  230.             final int current = read();
  231.             if (current == CR) {
  232.                 final int next = peek();
  233.                 if (next == LF) {
  234.                     read();
  235.                 }
  236.             }
  237.             if (current == EOF || current == LF || current == CR) {
  238.                 break;
  239.             }
  240.             buffer.append((char) current);
  241.         }
  242.         return buffer.toString();
  243.     }

  244.     @Override
  245.     public void reset() throws IOException {
  246.         lineNumber = lineNumberMark;
  247.         lastChar = lastCharMark;
  248.         position = positionMark;
  249.         bytesRead = bytesReadMark;
  250.         super.reset();
  251.     }

  252. }