Lexer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.Closeable;
import java.io.IOException;

import org.apache.commons.io.IOUtils;

/**
 * Lexical analyzer.
 */
final class Lexer implements Closeable {

    private static final String CR_STRING = Character.toString(Constants.CR);
    private static final String LF_STRING = Character.toString(Constants.LF);

    private final char[] delimiter;
    private final char[] delimiterBuf;
    private final char[] escapeDelimiterBuf;
    private final int escape;
    private final int quoteChar;
    private final int commentStart;
    private final boolean ignoreSurroundingSpaces;
    private final boolean ignoreEmptyLines;
    private final boolean lenientEof;
    private final boolean trailingData;

    /** The buffered reader. */
    private final ExtendedBufferedReader reader;
    private String firstEol;

    private boolean isLastTokenDelimiter;

    Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
        this.reader = reader;
        this.delimiter = format.getDelimiterCharArray();
        this.escape = nullToDisabled(format.getEscapeCharacter());
        this.quoteChar = nullToDisabled(format.getQuoteCharacter());
        this.commentStart = nullToDisabled(format.getCommentMarker());
        this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
        this.ignoreEmptyLines = format.getIgnoreEmptyLines();
        this.lenientEof = format.getLenientEof();
        this.trailingData = format.getTrailingData();
        this.delimiterBuf = new char[delimiter.length - 1];
        this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
    }

    /**
     * Appends the next escaped character to the token's content.
     *
     * @param token the current token
     * @throws IOException  on stream access error
     * @throws CSVException Thrown on invalid input.
     */
    private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
        if (isEscapeDelimiter()) {
            token.content.append(delimiter);
        } else {
            final int unescaped = readEscape();
            if (unescaped == EOF) { // unexpected char after escape
                token.content.append((char) escape).append((char) reader.getLastChar());
            } else {
                token.content.append((char) unescaped);
            }
        }
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Gets the number of bytes read
     *
     * @return the number of bytes read
     */
    long getBytesRead() {
        return reader.getBytesRead();
    }

    /**
     * Returns the current character position
     *
     * @return the current character position
     */
    long getCharacterPosition() {
        return reader.getPosition();
    }

    /**
     * Returns the current line number
     *
     * @return the current line number
     */
    long getCurrentLineNumber() {
        return reader.getLineNumber();
    }

    String getFirstEol() {
        return firstEol;
    }

    boolean isClosed() {
        return reader.isClosed();
    }

    boolean isCommentStart(final int ch) {
        return ch == commentStart;
    }

    /**
     * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
     *
     * @param ch
     *             the current character.
     * @return true if the next characters constitute a delimiter.
     * @throws IOException If an I/O error occurs.
     */
    boolean isDelimiter(final int ch) throws IOException {
        isLastTokenDelimiter = false;
        if (ch != delimiter[0]) {
            return false;
        }
        if (delimiter.length == 1) {
            isLastTokenDelimiter = true;
            return true;
        }
        reader.peek(delimiterBuf);
        for (int i = 0; i < delimiterBuf.length; i++) {
            if (delimiterBuf[i] != delimiter[i + 1]) {
                return false;
            }
        }
        final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
        isLastTokenDelimiter = count != EOF;
        return isLastTokenDelimiter;
    }

    /**
     * Tests if the given character indicates the end of the file.
     *
     * @return true if the given character indicates the end of the file.
     */
    boolean isEndOfFile(final int ch) {
        return ch == EOF;
    }

    /**
     * Tests if the given character is the escape character.
     *
     * @return true if the given character is the escape character.
     */
    boolean isEscape(final int ch) {
        return ch == escape;
    }

    /**
     * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
     *
     * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
     *
     * @return true if the next characters constitute an escape delimiter.
     * @throws IOException If an I/O error occurs.
     */
    boolean isEscapeDelimiter() throws IOException {
        reader.peek(escapeDelimiterBuf);
        if (escapeDelimiterBuf[0] != delimiter[0]) {
            return false;
        }
        for (int i = 1; i < delimiter.length; i++) {
            if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
                return false;
            }
        }
        final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
        return count != EOF;
    }

    private boolean isMetaChar(final int ch) {
        return ch == escape || ch == quoteChar || ch == commentStart;
    }

    boolean isQuoteChar(final int ch) {
        return ch == quoteChar;
    }

    /**
     * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
     *
     * @param ch the character to check
     * @return true if the character is at the start of a line.
     */
    boolean isStartOfLine(final int ch) {
        return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
    }

    /**
     * Returns the next token.
     * <p>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     * </p>
     *
     * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
     * @return the next token found.
     * @throws IOException  on stream access error.
     * @throws CSVException Thrown on invalid input.
     */
    Token nextToken(final Token token) throws IOException {
        // Get the last read char (required for empty line detection)
        int lastChar = reader.getLastChar();
        // read the next char and set eol
        int c = reader.read();
        // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
        boolean eol = readEndOfLine(c);
        // empty line detection: eol AND (last char was EOL or beginning)
        if (ignoreEmptyLines) {
            while (eol && isStartOfLine(lastChar)) {
                // Go on char ahead ...
                lastChar = c;
                c = reader.read();
                eol = readEndOfLine(c);
                // reached the end of the file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    token.type = Token.Type.EOF;
                    // don't set token.isReady here because no content
                    return token;
                }
            }
        }
        // Did we reach EOF during the last iteration already? EOF
        if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
            token.type = Token.Type.EOF;
            // don't set token.isReady here because no content
            return token;
        }
        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            final String line = reader.readLine();
            if (line == null) {
                token.type = Token.Type.EOF;
                // don't set token.isReady here because no content
                return token;
            }
            final String comment = line.trim();
            token.content.append(comment);
            token.type = Token.Type.COMMENT;
            return token;
        }
        // Important: make sure a new char gets consumed in each iteration
        while (token.type == Token.Type.INVALID) {
            // ignore whitespaces at beginning of a token
            if (ignoreSurroundingSpaces) {
                while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
                    c = reader.read();
                    eol = readEndOfLine(c);
                }
            }
            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token return TOKEN("")
                token.type = Token.Type.TOKEN;
            } else if (eol) {
                // empty token return EORECORD("")
                // noop: token.content.append("");
                token.type = Token.Type.EORECORD;
            } else if (isQuoteChar(c)) {
                // consume encapsulated token
                parseEncapsulatedToken(token);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                // noop: token.content.append("");
                token.type = Token.Type.EOF;
                token.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                parseSimpleToken(token, c);
            }
        }
        return token;
    }

    private int nullToDisabled(final Character c) {
        return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
    }

    /**
     * Parses an encapsulated token.
     * <p>
     * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
     * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
     * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
     * </p>
     * <ul>
     * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
     * <ul>
     * <li>delimiter (TOKEN)</li>
     * <li>end of line (EORECORD)</li>
     * </ul>
     * <li>end of stream has been reached (EOF)</li> </ul>
     *
     * @param token
     *            the current token
     * @return a valid token object
     * @throws IOException
     *             Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
     *             delimiter or EOL.
     * @throws CSVException Thrown on invalid input.
     */
    private Token parseEncapsulatedToken(final Token token) throws IOException {
        token.isQuoted = true;
        // Save current line number in case needed for IOE
        final long startLineNumber = getCurrentLineNumber();
        int c;
        while (true) {
            c = reader.read();

            if (isQuoteChar(c)) {
                if (isQuoteChar(reader.peek())) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = reader.read();
                    token.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = reader.read();
                        if (isDelimiter(c)) {
                            token.type = Token.Type.TOKEN;
                            return token;
                        }
                        if (isEndOfFile(c)) {
                            token.type = Token.Type.EOF;
                            token.isReady = true; // There is data at EOF
                            return token;
                        }
                        if (readEndOfLine(c)) {
                            token.type = Token.Type.EORECORD;
                            return token;
                        }
                        if (trailingData) {
                            token.content.append((char) c);
                        } else if (!Character.isWhitespace((char) c)) {
                            // error invalid char between token and next delimiter
                            throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
                                    getCurrentLineNumber(), getCharacterPosition());
                        }
                    }
                }
            } else if (isEscape(c)) {
                appendNextEscapedCharacterToToken(token);
            } else if (isEndOfFile(c)) {
                if (lenientEof) {
                    token.type = Token.Type.EOF;
                    token.isReady = true; // There is data at EOF
                    return token;
                }
                // error condition (end of file before end of token)
                throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
            } else {
                // consume character
                token.content.append((char) c);
            }
        }
    }

    /**
     * Parses a simple token.
     * <p>
     * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped delimiters (as \, or \;). The token is finished
     * when one of the following conditions becomes true:
     * </p>
     * <ul>
     * <li>The end of line has been reached (EORECORD)</li>
     * <li>The end of stream has been reached (EOF)</li>
     * <li>An unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     *
     * @param token the current token
     * @param ch     the current character
     * @return the filled token
     * @throws IOException  on stream access error
     * @throws CSVException Thrown on invalid input.
     */
    private Token parseSimpleToken(final Token token, final int ch) throws IOException {
        // Faster to use while(true)+break than while(token.type == INVALID)
        int cur = ch;
        while (true) {
            if (readEndOfLine(cur)) {
                token.type = Token.Type.EORECORD;
                break;
            }
            if (isEndOfFile(cur)) {
                token.type = Token.Type.EOF;
                token.isReady = true; // There is data at EOF
                break;
            }
            if (isDelimiter(cur)) {
                token.type = Token.Type.TOKEN;
                break;
            }
            // continue
            if (isEscape(cur)) {
                appendNextEscapedCharacterToToken(token);
            } else {
                token.content.append((char) cur);
            }
            cur = reader.read(); // continue
        }

        if (ignoreSurroundingSpaces) {
            trimTrailingSpaces(token.content);
        }

        return token;
    }

    /**
     * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
     *
     * @return true if the given or next character is a line-terminator
     */
    boolean readEndOfLine(final int ch) throws IOException {
        // check if we have \r\n...
        int cur = ch;
        if (cur == Constants.CR && reader.peek() == Constants.LF) {
            // note: does not change ch outside of this method!
            cur = reader.read();
            // Save the EOL state
            if (firstEol == null) {
                this.firstEol = Constants.CRLF;
            }
        }
        // save EOL state here.
        if (firstEol == null) {
            if (cur == Constants.LF) {
                this.firstEol = LF_STRING;
            } else if (cur == Constants.CR) {
                this.firstEol = CR_STRING;
            }
        }

        return cur == Constants.LF || cur == Constants.CR;
    }

    // TODO escape handling needs more work
    /**
     * Handle an escape sequence. The current character must be the escape character. On return, the next character is available by calling
     * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
     *
     * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
     * @throws IOException  if there is a problem reading the stream or the end of stream is detected: the escape character is not allowed at end of stream
     * @throws CSVException Thrown on invalid input.
     */
    int readEscape() throws IOException {
        // the escape char has just been read (normally a backslash)
        final int ch = reader.read();
        switch (ch) {
        case 'r':
            return Constants.CR;
        case 'n':
            return Constants.LF;
        case 't':
            return Constants.TAB;
        case 'b':
            return Constants.BACKSPACE;
        case 'f':
            return Constants.FF;
        case Constants.CR:
        case Constants.LF:
        case Constants.FF: // TODO is this correct?
        case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
        case Constants.BACKSPACE: // TODO is this correct?
            return ch;
        case EOF:
            throw new CSVException("EOF while processing escape sequence");
        default:
            // Now check for meta-characters
            if (isMetaChar(ch)) {
                return ch;
            }
            // indicate unexpected char - available from in.getLastChar()
            return EOF;
        }
    }

    void trimTrailingSpaces(final StringBuilder buffer) {
        int length = buffer.length();
        while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
            length--;
        }
        if (length != buffer.length()) {
            buffer.setLength(length);
        }
    }
}