Lexer.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
- package org.apache.commons.csv;
- import static org.apache.commons.io.IOUtils.EOF;
- import java.io.Closeable;
- import java.io.IOException;
- import java.util.Arrays;
- import org.apache.commons.io.IOUtils;
- /**
- * Lexical analyzer.
- */
- final class Lexer implements Closeable {
- private static final String CR_STRING = Character.toString(Constants.CR);
- private static final String LF_STRING = Character.toString(Constants.LF);
- private final char[] delimiter;
- private final char[] delimiterBuf;
- private final char[] escapeDelimiterBuf;
- private final int escape;
- private final int quoteChar;
- private final int commentStart;
- private final boolean ignoreSurroundingSpaces;
- private final boolean ignoreEmptyLines;
- private final boolean lenientEof;
- private final boolean trailingData;
- /** The buffered reader. */
- private final ExtendedBufferedReader reader;
- private String firstEol;
- private boolean isLastTokenDelimiter;
- Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
- this.reader = reader;
- this.delimiter = format.getDelimiterCharArray();
- this.escape = nullToDisabled(format.getEscapeCharacter());
- this.quoteChar = nullToDisabled(format.getQuoteCharacter());
- this.commentStart = nullToDisabled(format.getCommentMarker());
- this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
- this.ignoreEmptyLines = format.getIgnoreEmptyLines();
- this.lenientEof = format.getLenientEof();
- this.trailingData = format.getTrailingData();
- this.delimiterBuf = new char[delimiter.length - 1];
- this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
- }
- /**
-  * Appends the character (or delimiter) that follows an escape character to the token's content.
-  * <p>
-  * An escaped delimiter is appended as the plain delimiter. A recognized escape sequence is appended as the character it stands for. For any other
-  * character, both the escape character and the character that followed it are appended unchanged.
-  * </p>
-  *
-  * @param token the current token
-  * @throws IOException on stream access error
-  * @throws CSVException Thrown on invalid input.
-  */
- private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
-     if (isEscapeDelimiter()) {
-         token.content.append(delimiter);
-         return;
-     }
-     final int decoded = readEscape();
-     if (decoded != EOF) {
-         token.content.append((char) decoded);
-         return;
-     }
-     // Unexpected char after the escape: keep the escape and that char as they were read.
-     token.content.append((char) escape).append((char) reader.getLastChar());
- }
- /**
-  * Closes the underlying reader.
-  *
-  * @throws IOException
-  *             If an I/O error occurs
-  */
- @Override
- public void close() throws IOException {
- reader.close();
- }
- /**
-  * Gets the number of bytes read, as reported by the underlying reader.
-  *
-  * @return the number of bytes read
-  */
- long getBytesRead() {
- return reader.getBytesRead();
- }
- /**
-  * Returns the current character position, as reported by the underlying reader.
-  *
-  * @return the current character position
-  */
- long getCharacterPosition() {
- return reader.getPosition();
- }
- /**
-  * Returns the current line number, as reported by the underlying reader.
-  *
-  * @return the current line number
-  */
- long getCurrentLineNumber() {
- return reader.getLineNumber();
- }
- /**
-  * Gets the first end-of-line sequence recorded by {@link #readEndOfLine(int)}.
-  *
-  * @return the first end-of-line string seen so far, or {@code null} if none has been recorded yet.
-  */
- String getFirstEol() {
- return firstEol;
- }
- /**
-  * Tests whether the underlying reader reports itself as closed.
-  *
-  * @return the closed state reported by the underlying reader.
-  */
- boolean isClosed() {
- return reader.isClosed();
- }
- /**
-  * Tests if the given character is the comment start character taken from the format.
-  *
-  * @param ch the character to test.
-  * @return true if the given character equals the comment start character.
-  */
- boolean isCommentStart(final int ch) {
- return ch == commentStart;
- }
- /**
-  * Tests whether the given character starts a delimiter, consuming the rest of a multi-character delimiter on a match.
-  * <p>
-  * The characters after the first are inspected through {@link ExtendedBufferedReader#peek(char[])} and are only read once they all match. As a side
-  * effect, this method records in {@code isLastTokenDelimiter} whether a delimiter was recognized.
-  * </p>
-  *
-  * @param ch
-  *            the current character.
-  * @return true if {@code ch} and the characters that follow it constitute a delimiter.
-  * @throws IOException If an I/O error occurs.
-  */
- boolean isDelimiter(final int ch) throws IOException {
-     isLastTokenDelimiter = false;
-     if (ch != delimiter[0]) {
-         return false;
-     }
-     if (delimiter.length == 1) {
-         isLastTokenDelimiter = true;
-         return true;
-     }
-     // The scratch buffer is reused across calls and the outcome of peek is not checked here, so wipe it first: if
-     // fewer characters than requested are available (for example near the end of the input), leftovers from an
-     // earlier call must not be mistaken for the tail of a delimiter.
-     Arrays.fill(delimiterBuf, '\0');
-     reader.peek(delimiterBuf);
-     for (int i = 0; i < delimiterBuf.length; i++) {
-         if (delimiterBuf[i] != delimiter[i + 1]) {
-             return false;
-         }
-     }
-     // Every remaining delimiter character matched: consume what was peeked.
-     final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
-     isLastTokenDelimiter = count != EOF;
-     return isLastTokenDelimiter;
- }
- /**
-  * Tests if the given character indicates the end of the file.
-  *
-  * @param ch the character to test.
-  * @return true if the given character indicates the end of the file.
-  */
- boolean isEndOfFile(final int ch) {
- return ch == EOF;
- }
- /**
-  * Tests if the given character is the escape character.
-  *
-  * @param ch the character to test.
-  * @return true if the given character is the escape character.
-  */
- boolean isEscape(final int ch) {
- return ch == escape;
- }
- /**
-  * Tests if the next characters constitute an escaped delimiter through {@link ExtendedBufferedReader#peek(char[])}, consuming them on a match.
-  * <p>
-  * The expected characters are the first delimiter character followed by every further delimiter character preceded by the escape character. For
-  * example, for delimiter "[|]" and escape '!', the complete escaped delimiter is "![!|!]"; this method is called after the leading '!' has been read and
-  * returns true if the next characters are "[!|!]".
-  * </p>
-  *
-  * @return true if the next characters constitute an escaped delimiter.
-  * @throws IOException If an I/O error occurs.
-  */
- boolean isEscapeDelimiter() throws IOException {
-     // The scratch buffer is reused across calls and the outcome of peek is not checked here, so wipe it first: if
-     // fewer characters than requested are available (for example near the end of the input), leftovers from an
-     // earlier call must not be mistaken for part of an escaped delimiter.
-     Arrays.fill(escapeDelimiterBuf, '\0');
-     reader.peek(escapeDelimiterBuf);
-     if (escapeDelimiterBuf[0] != delimiter[0]) {
-         return false;
-     }
-     for (int i = 1; i < delimiter.length; i++) {
-         // Odd slots must hold the escape character, even slots the matching delimiter character.
-         if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
-             return false;
-         }
-     }
-     // Full match: consume what was peeked.
-     final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
-     return count != EOF;
- }
- /**
-  * Tests if the given character is one of the format's special characters: the escape, the quote or the comment start character.
-  *
-  * @param ch the character to test.
-  * @return true if the given character is the escape, quote or comment start character.
-  */
- private boolean isMetaChar(final int ch) {
-     return isEscape(ch) || isQuoteChar(ch) || isCommentStart(ch);
- }
- /**
-  * Tests if the given character is the quote character.
-  *
-  * @param ch the character to test.
-  * @return true if the given character is the quote character.
-  */
- boolean isQuoteChar(final int ch) {
- return ch == quoteChar;
- }
- /**
- * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
- *
- * @param ch the character to check
- * @return true if the character is at the start of a line.
- */
- boolean isStartOfLine(final int ch) {
- return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
- }
- /**
-  * Returns the next token.
-  * <p>
-  * A token corresponds to a term, a record change, a comment or an end-of-file indicator.
-  * </p>
-  *
-  * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
-  * @return the next token found.
-  * @throws IOException on stream access error.
-  * @throws CSVException Thrown on invalid input.
-  */
- Token nextToken(final Token token) throws IOException {
-     // The previously read char is needed to detect empty lines and comments.
-     int previous = reader.getLastChar();
-     int current = reader.read();
-     // Note: readEndOfLine swallows the LF of a CR LF pair. Whether the line ended with CR or LF does not matter here.
-     boolean atEol = readEndOfLine(current);
-     // Empty line detection: an end of line that directly follows another end of line or the beginning.
-     while (ignoreEmptyLines && atEol && isStartOfLine(previous)) {
-         // Go one char ahead ...
-         previous = current;
-         current = reader.read();
-         atEol = readEndOfLine(current);
-         if (current == EOF) {
-             // The input ended without any further content (empty line at the end).
-             token.type = Token.Type.EOF;
-             // token.isReady is left alone because there is no content
-             return token;
-         }
-     }
-     // Was EOF already reached by the previous read, or is it reached now with no delimiter pending?
-     if (previous == EOF || !isLastTokenDelimiter && current == EOF) {
-         token.type = Token.Type.EOF;
-         // token.isReady is left alone because there is no content
-         return token;
-     }
-     // A comment marker at the start of a line turns the rest of the line into a COMMENT token.
-     if (isStartOfLine(previous) && isCommentStart(current)) {
-         final String commentLine = reader.readLine();
-         if (commentLine == null) {
-             token.type = Token.Type.EOF;
-             // token.isReady is left alone because there is no content
-             return token;
-         }
-         token.content.append(commentLine.trim());
-         token.type = Token.Type.COMMENT;
-         return token;
-     }
-     // Important: every pass must consume at least one char.
-     while (token.type == Token.Type.INVALID) {
-         // Skip whitespace in front of a token when asked to.
-         while (ignoreSurroundingSpaces && Character.isWhitespace((char) current) && !isDelimiter(current) && !atEol) {
-             current = reader.read();
-             atEol = readEndOfLine(current);
-         }
-         // The start of the token is reached: decide what kind it is.
-         if (isDelimiter(current)) {
-             // empty value: TOKEN("")
-             token.type = Token.Type.TOKEN;
-         } else if (atEol) {
-             // empty value at the end of a record: EORECORD("")
-             token.type = Token.Type.EORECORD;
-         } else if (isQuoteChar(current)) {
-             // quoted value
-             parseEncapsulatedToken(token);
-         } else if (current == EOF) {
-             // end of file: EOF()
-             token.type = Token.Type.EOF;
-             token.isReady = true; // there is data at EOF
-         } else {
-             // anything else starts a simple, unquoted value
-             parseSimpleToken(token, current);
-         }
-     }
-     return token;
- }
- /**
-  * Converts an optional character setting to its int form.
-  *
-  * @param c the character setting, may be {@code null}.
-  * @return {@code Constants.UNDEFINED} if {@code c} is {@code null}, otherwise the character value.
-  */
- private int nullToDisabled(final Character c) {
-     if (c == null) {
-         return Constants.UNDEFINED;
-     }
-     return c.charValue(); // Explicit unboxing
- }
- /**
- * Parses an encapsulated token.
- * <p>
- * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
- * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
- * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
- * </p>
- * <ul>
- * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
- * <ul>
- * <li>delimiter (TOKEN)</li>
- * <li>end of line (EORECORD)</li>
- * </ul>
- * <li>end of stream has been reached (EOF)</li> </ul>
- *
- * @param token
- * the current token
- * @return a valid token object
- * @throws IOException
- * Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
- * delimiter or EOL.
- * @throws CSVException Thrown on invalid input.
- */
- private Token parseEncapsulatedToken(final Token token) throws IOException {
- token.isQuoted = true;
- // Save current line number in case needed for IOE
- final long startLineNumber = getCurrentLineNumber();
- int c;
- while (true) {
- c = reader.read();
- if (isQuoteChar(c)) {
- if (isQuoteChar(reader.peek())) {
- // double or escaped encapsulator -> add single encapsulator to token
- c = reader.read();
- token.content.append((char) c);
- } else {
- // token finish mark (encapsulator) reached: ignore whitespace till delimiter
- while (true) {
- c = reader.read();
- if (isDelimiter(c)) {
- token.type = Token.Type.TOKEN;
- return token;
- }
- if (isEndOfFile(c)) {
- token.type = Token.Type.EOF;
- token.isReady = true; // There is data at EOF
- return token;
- }
- if (readEndOfLine(c)) {
- token.type = Token.Type.EORECORD;
- return token;
- }
- if (trailingData) {
- token.content.append((char) c);
- } else if (!Character.isWhitespace((char) c)) {
- // error invalid char between token and next delimiter
- throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
- getCurrentLineNumber(), getCharacterPosition());
- }
- }
- }
- } else if (isEscape(c)) {
- appendNextEscapedCharacterToToken(token);
- } else if (isEndOfFile(c)) {
- if (lenientEof) {
- token.type = Token.Type.EOF;
- token.isReady = true; // There is data at EOF
- return token;
- }
- // error condition (end of file before end of token)
- throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
- } else {
- // consume character
- token.content.append((char) c);
- }
- }
- }
- /**
-  * Parses a simple token.
-  * <p>
-  * Simple tokens are tokens that are not surrounded by quote characters. A simple token might contain escaped delimiters (as \, or \;). Parsing stops at
-  * the first of:
-  * </p>
-  * <ul>
-  * <li>an end of line (EORECORD)</li>
-  * <li>the end of the stream (EOF)</li>
-  * <li>an unescaped delimiter (TOKEN)</li>
-  * </ul>
-  * <p>
-  * Trailing whitespace is removed from the content when surrounding spaces are ignored.
-  * </p>
-  *
-  * @param token the current token
-  * @param ch the current character
-  * @return the filled token
-  * @throws IOException on stream access error
-  * @throws CSVException Thrown on invalid input.
-  */
- private Token parseSimpleToken(final Token token, final int ch) throws IOException {
-     // Each pass handles one character; the loop update reads the next one.
-     for (int current = ch;; current = reader.read()) {
-         if (readEndOfLine(current)) {
-             token.type = Token.Type.EORECORD;
-             break;
-         }
-         if (current == EOF) {
-             token.type = Token.Type.EOF;
-             token.isReady = true; // There is data at EOF
-             break;
-         }
-         if (isDelimiter(current)) {
-             token.type = Token.Type.TOKEN;
-             break;
-         }
-         // Still inside the token: decode an escape sequence or take the character as is.
-         if (isEscape(current)) {
-             appendNextEscapedCharacterToToken(token);
-         } else {
-             token.content.append((char) current);
-         }
-     }
-     if (ignoreSurroundingSpaces) {
-         trimTrailingSpaces(token.content);
-     }
-     return token;
- }
- /**
- * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
- *
- * @return true if the given or next character is a line-terminator
- */
- boolean readEndOfLine(final int ch) throws IOException {
- // check if we have \r\n...
- int cur = ch;
- if (cur == Constants.CR && reader.peek() == Constants.LF) {
- // note: does not change ch outside of this method!
- cur = reader.read();
- // Save the EOL state
- if (firstEol == null) {
- this.firstEol = Constants.CRLF;
- }
- }
- // save EOL state here.
- if (firstEol == null) {
- if (cur == Constants.LF) {
- this.firstEol = LF_STRING;
- } else if (cur == Constants.CR) {
- this.firstEol = CR_STRING;
- }
- }
- return cur == Constants.LF || cur == Constants.CR;
- }
- // TODO escape handling needs more work
- /**
-  * Handles an escape sequence. The escape character must have just been read. On return, the character that followed it is available by calling
-  * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
-  *
-  * @return the unescaped character (as an int) or {@link IOUtils#EOF} if the char following the escape is invalid.
-  * @throws IOException if there is a problem reading the stream
-  * @throws CSVException Thrown when the end of the stream follows the escape character: the escape character is not allowed at the end of the stream.
-  */
- int readEscape() throws IOException {
-     // the escape char (normally a backslash) has already been consumed; look at what follows it
-     final int next = reader.read();
-     if (next == EOF) {
-         throw new CSVException("EOF while processing escape sequence");
-     }
-     switch (next) {
-     // letter codes stand for control characters
-     case 'r':
-         return Constants.CR;
-     case 'n':
-         return Constants.LF;
-     case 't':
-         return Constants.TAB;
-     case 'b':
-         return Constants.BACKSPACE;
-     case 'f':
-         return Constants.FF;
-     // escaped control characters stand for themselves
-     case Constants.CR:
-     case Constants.LF:
-     case Constants.FF: // TODO is this correct?
-     case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
-     case Constants.BACKSPACE: // TODO is this correct?
-         return next;
-     default:
-         // A meta-character stands for itself; anything else is reported as unexpected and stays available from
-         // reader.getLastChar()
-         return isMetaChar(next) ? next : EOF;
-     }
- }
- void trimTrailingSpaces(final StringBuilder buffer) {
- int length = buffer.length();
- while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
- length--;
- }
- if (length != buffer.length()) {
- buffer.setLength(length);
- }
- }
- }