Lexer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.csv;
import static org.apache.commons.io.IOUtils.EOF;
import java.io.Closeable;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
/**
* Lexical analyzer.
*/
final class Lexer implements Closeable {
private static final String CR_STRING = Character.toString(Constants.CR);
private static final String LF_STRING = Character.toString(Constants.LF);
private final char[] delimiter;
private final char[] delimiterBuf;
private final char[] escapeDelimiterBuf;
private final int escape;
private final int quoteChar;
private final int commentStart;
private final boolean ignoreSurroundingSpaces;
private final boolean ignoreEmptyLines;
private final boolean lenientEof;
private final boolean trailingData;
/** The buffered reader. */
private final ExtendedBufferedReader reader;
private String firstEol;
private boolean isLastTokenDelimiter;
Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
this.reader = reader;
this.delimiter = format.getDelimiterCharArray();
this.escape = nullToDisabled(format.getEscapeCharacter());
this.quoteChar = nullToDisabled(format.getQuoteCharacter());
this.commentStart = nullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.lenientEof = format.getLenientEof();
this.trailingData = format.getTrailingData();
this.delimiterBuf = new char[delimiter.length - 1];
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
}
/**
* Appends the next escaped character to the token's content.
*
* @param token the current token
* @throws IOException on stream access error
* @throws CSVException Thrown on invalid input.
*/
private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
if (isEscapeDelimiter()) {
token.content.append(delimiter);
} else {
final int unescaped = readEscape();
if (unescaped == EOF) { // unexpected char after escape
token.content.append((char) escape).append((char) reader.getLastChar());
} else {
token.content.append((char) unescaped);
}
}
}
/**
* Closes resources.
*
* @throws IOException
* If an I/O error occurs
*/
@Override
public void close() throws IOException {
reader.close();
}
/**
* Gets the number of bytes read
*
* @return the number of bytes read
*/
long getBytesRead() {
return reader.getBytesRead();
}
/**
* Returns the current character position
*
* @return the current character position
*/
long getCharacterPosition() {
return reader.getPosition();
}
/**
* Returns the current line number
*
* @return the current line number
*/
long getCurrentLineNumber() {
return reader.getLineNumber();
}
String getFirstEol() {
return firstEol;
}
boolean isClosed() {
return reader.isClosed();
}
boolean isCommentStart(final int ch) {
return ch == commentStart;
}
/**
* Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
*
* @param ch
* the current character.
* @return true if the next characters constitute a delimiter.
* @throws IOException If an I/O error occurs.
*/
boolean isDelimiter(final int ch) throws IOException {
isLastTokenDelimiter = false;
if (ch != delimiter[0]) {
return false;
}
if (delimiter.length == 1) {
isLastTokenDelimiter = true;
return true;
}
reader.peek(delimiterBuf);
for (int i = 0; i < delimiterBuf.length; i++) {
if (delimiterBuf[i] != delimiter[i + 1]) {
return false;
}
}
final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
isLastTokenDelimiter = count != EOF;
return isLastTokenDelimiter;
}
/**
* Tests if the given character indicates the end of the file.
*
* @return true if the given character indicates the end of the file.
*/
boolean isEndOfFile(final int ch) {
return ch == EOF;
}
/**
* Tests if the given character is the escape character.
*
* @return true if the given character is the escape character.
*/
boolean isEscape(final int ch) {
return ch == escape;
}
/**
* Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
*
* For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
*
* @return true if the next characters constitute an escape delimiter.
* @throws IOException If an I/O error occurs.
*/
boolean isEscapeDelimiter() throws IOException {
reader.peek(escapeDelimiterBuf);
if (escapeDelimiterBuf[0] != delimiter[0]) {
return false;
}
for (int i = 1; i < delimiter.length; i++) {
if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
return false;
}
}
final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
return count != EOF;
}
private boolean isMetaChar(final int ch) {
return ch == escape || ch == quoteChar || ch == commentStart;
}
boolean isQuoteChar(final int ch) {
return ch == quoteChar;
}
/**
* Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
*
* @param ch the character to check
* @return true if the character is at the start of a line.
*/
boolean isStartOfLine(final int ch) {
return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
}
/**
* Returns the next token.
* <p>
* A token corresponds to a term, a record change or an end-of-file indicator.
* </p>
*
* @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
* @return the next token found.
* @throws IOException on stream access error.
* @throws CSVException Thrown on invalid input.
*/
Token nextToken(final Token token) throws IOException {
// Get the last read char (required for empty line detection)
int lastChar = reader.getLastChar();
// read the next char and set eol
int c = reader.read();
// Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
boolean eol = readEndOfLine(c);
// empty line detection: eol AND (last char was EOL or beginning)
if (ignoreEmptyLines) {
while (eol && isStartOfLine(lastChar)) {
// Go on char ahead ...
lastChar = c;
c = reader.read();
eol = readEndOfLine(c);
// reached the end of the file without any content (empty line at the end)
if (isEndOfFile(c)) {
token.type = Token.Type.EOF;
// don't set token.isReady here because no content
return token;
}
}
}
// Did we reach EOF during the last iteration already? EOF
if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
token.type = Token.Type.EOF;
// don't set token.isReady here because no content
return token;
}
if (isStartOfLine(lastChar) && isCommentStart(c)) {
final String line = reader.readLine();
if (line == null) {
token.type = Token.Type.EOF;
// don't set token.isReady here because no content
return token;
}
final String comment = line.trim();
token.content.append(comment);
token.type = Token.Type.COMMENT;
return token;
}
// Important: make sure a new char gets consumed in each iteration
while (token.type == Token.Type.INVALID) {
// ignore whitespaces at beginning of a token
if (ignoreSurroundingSpaces) {
while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
c = reader.read();
eol = readEndOfLine(c);
}
}
// ok, start of token reached: encapsulated, or token
if (isDelimiter(c)) {
// empty token return TOKEN("")
token.type = Token.Type.TOKEN;
} else if (eol) {
// empty token return EORECORD("")
// noop: token.content.append("");
token.type = Token.Type.EORECORD;
} else if (isQuoteChar(c)) {
// consume encapsulated token
parseEncapsulatedToken(token);
} else if (isEndOfFile(c)) {
// end of file return EOF()
// noop: token.content.append("");
token.type = Token.Type.EOF;
token.isReady = true; // there is data at EOF
} else {
// next token must be a simple token
// add removed blanks when not ignoring whitespace chars...
parseSimpleToken(token, c);
}
}
return token;
}
private int nullToDisabled(final Character c) {
return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
}
/**
* Parses an encapsulated token.
* <p>
* Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
* in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
* an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
* </p>
* <ul>
* <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
* <ul>
* <li>delimiter (TOKEN)</li>
* <li>end of line (EORECORD)</li>
* </ul>
* <li>end of stream has been reached (EOF)</li> </ul>
*
* @param token
* the current token
* @return a valid token object
* @throws IOException
* Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
* delimiter or EOL.
* @throws CSVException Thrown on invalid input.
*/
private Token parseEncapsulatedToken(final Token token) throws IOException {
token.isQuoted = true;
// Save current line number in case needed for IOE
final long startLineNumber = getCurrentLineNumber();
int c;
while (true) {
c = reader.read();
if (isQuoteChar(c)) {
if (isQuoteChar(reader.peek())) {
// double or escaped encapsulator -> add single encapsulator to token
c = reader.read();
token.content.append((char) c);
} else {
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
while (true) {
c = reader.read();
if (isDelimiter(c)) {
token.type = Token.Type.TOKEN;
return token;
}
if (isEndOfFile(c)) {
token.type = Token.Type.EOF;
token.isReady = true; // There is data at EOF
return token;
}
if (readEndOfLine(c)) {
token.type = Token.Type.EORECORD;
return token;
}
if (trailingData) {
token.content.append((char) c);
} else if (!Character.isWhitespace((char) c)) {
// error invalid char between token and next delimiter
throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
getCurrentLineNumber(), getCharacterPosition());
}
}
}
} else if (isEscape(c)) {
appendNextEscapedCharacterToToken(token);
} else if (isEndOfFile(c)) {
if (lenientEof) {
token.type = Token.Type.EOF;
token.isReady = true; // There is data at EOF
return token;
}
// error condition (end of file before end of token)
throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
} else {
// consume character
token.content.append((char) c);
}
}
}
/**
* Parses a simple token.
* <p>
* Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped delimiters (as \, or \;). The token is finished
* when one of the following conditions becomes true:
* </p>
* <ul>
* <li>The end of line has been reached (EORECORD)</li>
* <li>The end of stream has been reached (EOF)</li>
* <li>An unescaped delimiter has been reached (TOKEN)</li>
* </ul>
*
* @param token the current token
* @param ch the current character
* @return the filled token
* @throws IOException on stream access error
* @throws CSVException Thrown on invalid input.
*/
private Token parseSimpleToken(final Token token, final int ch) throws IOException {
// Faster to use while(true)+break than while(token.type == INVALID)
int cur = ch;
while (true) {
if (readEndOfLine(cur)) {
token.type = Token.Type.EORECORD;
break;
}
if (isEndOfFile(cur)) {
token.type = Token.Type.EOF;
token.isReady = true; // There is data at EOF
break;
}
if (isDelimiter(cur)) {
token.type = Token.Type.TOKEN;
break;
}
// continue
if (isEscape(cur)) {
appendNextEscapedCharacterToToken(token);
} else {
token.content.append((char) cur);
}
cur = reader.read(); // continue
}
if (ignoreSurroundingSpaces) {
trimTrailingSpaces(token.content);
}
return token;
}
/**
* Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
*
* @return true if the given or next character is a line-terminator
*/
boolean readEndOfLine(final int ch) throws IOException {
// check if we have \r\n...
int cur = ch;
if (cur == Constants.CR && reader.peek() == Constants.LF) {
// note: does not change ch outside of this method!
cur = reader.read();
// Save the EOL state
if (firstEol == null) {
this.firstEol = Constants.CRLF;
}
}
// save EOL state here.
if (firstEol == null) {
if (cur == Constants.LF) {
this.firstEol = LF_STRING;
} else if (cur == Constants.CR) {
this.firstEol = CR_STRING;
}
}
return cur == Constants.LF || cur == Constants.CR;
}
// TODO escape handling needs more work
/**
* Handle an escape sequence. The current character must be the escape character. On return, the next character is available by calling
* {@link ExtendedBufferedReader#getLastChar()} on the input stream.
*
* @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
* @throws IOException if there is a problem reading the stream or the end of stream is detected: the escape character is not allowed at end of stream
* @throws CSVException Thrown on invalid input.
*/
int readEscape() throws IOException {
// the escape char has just been read (normally a backslash)
final int ch = reader.read();
switch (ch) {
case 'r':
return Constants.CR;
case 'n':
return Constants.LF;
case 't':
return Constants.TAB;
case 'b':
return Constants.BACKSPACE;
case 'f':
return Constants.FF;
case Constants.CR:
case Constants.LF:
case Constants.FF: // TODO is this correct?
case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
case Constants.BACKSPACE: // TODO is this correct?
return ch;
case EOF:
throw new CSVException("EOF while processing escape sequence");
default:
// Now check for meta-characters
if (isMetaChar(ch)) {
return ch;
}
// indicate unexpected char - available from in.getLastChar()
return EOF;
}
}
void trimTrailingSpaces(final StringBuilder buffer) {
int length = buffer.length();
while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
length--;
}
if (length != buffer.length()) {
buffer.setLength(length);
}
}
}