ExtendedBufferedReader.java
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.csv;
import static org.apache.commons.csv.Constants.CR;
import static org.apache.commons.csv.Constants.LF;
import static org.apache.commons.csv.Constants.UNDEFINED;
import static org.apache.commons.io.IOUtils.EOF;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
/**
* A special buffered reader which supports sophisticated read access.
* <p>
* In particular the reader supports a look-ahead option, which allows you to see the next char returned by
* {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
* </p>
*/
final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
/** The last char returned */
private int lastChar = UNDEFINED;
private int lastCharMark = UNDEFINED;
/** The count of EOLs (CR/LF/CRLF) seen so far */
private long lineNumber;
private long lineNumberMark;
/** The position, which is the number of characters read so far */
private long position;
private long positionMark;
/** The number of bytes read so far. */
private long bytesRead;
private long bytesReadMark;
/** Encoder for calculating the number of bytes for each character read. */
private final CharsetEncoder encoder;
/**
* Constructs a new instance using the default buffer size.
*/
ExtendedBufferedReader(final Reader reader) {
this(reader, null, false);
}
/**
* Constructs a new instance with the specified reader, character set,
* and byte tracking option. Initializes an encoder if byte tracking is enabled
* and a character set is provided.
*
* @param reader the reader supports a look-ahead option.
* @param charset the character set for encoding, or {@code null} if not applicable.
* @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
*/
ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
super(reader);
encoder = charset != null && trackBytes ? charset.newEncoder() : null;
}
/**
* Closes the stream.
*
* @throws IOException
* If an I/O error occurs
*/
@Override
public void close() throws IOException {
// Set ivars before calling super close() in case close() throws an IOException.
lastChar = EOF;
super.close();
}
/**
* Gets the number of bytes read by the reader.
*
* @return the number of bytes read by the read
*/
long getBytesRead() {
return this.bytesRead;
}
/**
* Gets the byte length of the given character based on the the original Unicode
* specification, which defined characters as fixed-width 16-bit entities.
* <p>
* The Unicode characters are divided into two main ranges:
* <ul>
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
* <ul>
* <li>Represented using a single 16-bit {@code char}.</li>
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
* </ul>
* </li>
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
* <ul>
* <li>Represented as a pair of {@code char}s:</li>
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
* </ul>
* </li>
* </ul>
*
* @param current the current character to process.
* @return the byte length of the character.
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
final char cChar = (char) current;
final char lChar = (char) lastChar;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
if (Character.isHighSurrogate(cChar)) {
// Move on to the next char (low surrogate)
return 0;
} else if (Character.isSurrogatePair(lChar, cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { lChar, cChar })).limit();
} else {
throw new CharacterCodingException();
}
}
/**
* Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
* any of the read methods. This will not include a character read using the {@link #peek()} method. If no
* character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
* on the last read then this will return {@link IOUtils#EOF}.
*
* @return the last character that was read
*/
int getLastChar() {
return lastChar;
}
/**
* Returns the current line number
*
* @return the current line number
*/
long getLineNumber() {
// Check if we are at EOL or EOF or just starting
if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) {
return lineNumber; // counter is accurate
}
return lineNumber + 1; // Allow for counter being incremented only at EOL
}
/**
* Gets the character position in the reader.
*
* @return the current position in the reader (counting characters, not bytes since this is a Reader)
*/
long getPosition() {
return this.position;
}
@Override
public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}
@Override
public int read() throws IOException {
final int current = super.read();
if (current == CR || current == LF && lastChar != CR ||
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(current);
}
lastChar = current;
position++;
return lastChar;
}
@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
return 0;
}
final int len = super.read(buf, offset, length);
if (len > 0) {
for (int i = offset; i < offset + len; i++) {
final char ch = buf[i];
if (ch == LF) {
if (CR != (i > offset ? buf[i - 1] : lastChar)) {
lineNumber++;
}
} else if (ch == CR) {
lineNumber++;
}
}
lastChar = buf[offset + len - 1];
} else if (len == EOF) {
lastChar = EOF;
}
position += len;
return len;
}
/**
* Gets the next line, dropping the line terminator(s). This method should only be called when processing a
* comment, otherwise, information can be lost.
* <p>
* Increments {@link #lineNumber} and updates {@link #position}.
* </p>
* <p>
* Sets {@link #lastChar} to {@code Constants.EOF} at EOF, otherwise the last EOL character.
* </p>
*
* @return the line that was read, or null if reached EOF.
*/
@Override
public String readLine() throws IOException {
if (peek() == EOF) {
return null;
}
final StringBuilder buffer = new StringBuilder();
while (true) {
final int current = read();
if (current == CR) {
final int next = peek();
if (next == LF) {
read();
}
}
if (current == EOF || current == LF || current == CR) {
break;
}
buffer.append((char) current);
}
return buffer.toString();
}
@Override
public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}
}