Lexer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  *   https://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */

  19. package org.apache.commons.csv;

  20. import static org.apache.commons.io.IOUtils.EOF;

  21. import java.io.Closeable;
  22. import java.io.IOException;

  23. import org.apache.commons.io.IOUtils;

  24. /**
  25.  * Lexical analyzer.
  26.  */
  27. final class Lexer implements Closeable {

  28.     private static final String CR_STRING = Character.toString(Constants.CR);
  29.     private static final String LF_STRING = Character.toString(Constants.LF);

  30.     private final char[] delimiter;
  31.     private final char[] delimiterBuf;
  32.     private final char[] escapeDelimiterBuf;
  33.     private final int escape;
  34.     private final int quoteChar;
  35.     private final int commentStart;
  36.     private final boolean ignoreSurroundingSpaces;
  37.     private final boolean ignoreEmptyLines;
  38.     private final boolean lenientEof;
  39.     private final boolean trailingData;

  40.     /** The buffered reader. */
  41.     private final ExtendedBufferedReader reader;
  42.     private String firstEol;

  43.     private boolean isLastTokenDelimiter;

  44.     Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
  45.         this.reader = reader;
  46.         this.delimiter = format.getDelimiterCharArray();
  47.         this.escape = nullToDisabled(format.getEscapeCharacter());
  48.         this.quoteChar = nullToDisabled(format.getQuoteCharacter());
  49.         this.commentStart = nullToDisabled(format.getCommentMarker());
  50.         this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
  51.         this.ignoreEmptyLines = format.getIgnoreEmptyLines();
  52.         this.lenientEof = format.getLenientEof();
  53.         this.trailingData = format.getTrailingData();
  54.         this.delimiterBuf = new char[delimiter.length - 1];
  55.         this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
  56.     }

  57.     /**
  58.      * Appends the next escaped character to the token's content.
  59.      *
  60.      * @param token the current token
  61.      * @throws IOException  on stream access error
  62.      * @throws CSVException Thrown on invalid input.
  63.      */
  64.     private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
  65.         if (isEscapeDelimiter()) {
  66.             token.content.append(delimiter);
  67.         } else {
  68.             final int unescaped = readEscape();
  69.             if (unescaped == EOF) { // unexpected char after escape
  70.                 token.content.append((char) escape).append((char) reader.getLastChar());
  71.             } else {
  72.                 token.content.append((char) unescaped);
  73.             }
  74.         }
  75.     }

  76.     /**
  77.      * Closes resources.
  78.      *
  79.      * @throws IOException
  80.      *             If an I/O error occurs
  81.      */
  82.     @Override
  83.     public void close() throws IOException {
  84.         reader.close();
  85.     }

  86.     /**
  87.      * Gets the number of bytes read
  88.      *
  89.      * @return the number of bytes read
  90.      */
  91.     long getBytesRead() {
  92.         return reader.getBytesRead();
  93.     }

  94.     /**
  95.      * Returns the current character position
  96.      *
  97.      * @return the current character position
  98.      */
  99.     long getCharacterPosition() {
  100.         return reader.getPosition();
  101.     }

  102.     /**
  103.      * Returns the current line number
  104.      *
  105.      * @return the current line number
  106.      */
  107.     long getCurrentLineNumber() {
  108.         return reader.getLineNumber();
  109.     }

  110.     String getFirstEol() {
  111.         return firstEol;
  112.     }

  113.     boolean isClosed() {
  114.         return reader.isClosed();
  115.     }

  116.     boolean isCommentStart(final int ch) {
  117.         return ch == commentStart;
  118.     }

  119.     /**
  120.      * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
  121.      *
  122.      * @param ch
  123.      *             the current character.
  124.      * @return true if the next characters constitute a delimiter.
  125.      * @throws IOException If an I/O error occurs.
  126.      */
  127.     boolean isDelimiter(final int ch) throws IOException {
  128.         isLastTokenDelimiter = false;
  129.         if (ch != delimiter[0]) {
  130.             return false;
  131.         }
  132.         if (delimiter.length == 1) {
  133.             isLastTokenDelimiter = true;
  134.             return true;
  135.         }
  136.         reader.peek(delimiterBuf);
  137.         for (int i = 0; i < delimiterBuf.length; i++) {
  138.             if (delimiterBuf[i] != delimiter[i + 1]) {
  139.                 return false;
  140.             }
  141.         }
  142.         final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
  143.         isLastTokenDelimiter = count != EOF;
  144.         return isLastTokenDelimiter;
  145.     }

  146.     /**
  147.      * Tests if the given character indicates the end of the file.
  148.      *
  149.      * @return true if the given character indicates the end of the file.
  150.      */
  151.     boolean isEndOfFile(final int ch) {
  152.         return ch == EOF;
  153.     }

  154.     /**
  155.      * Tests if the given character is the escape character.
  156.      *
  157.      * @return true if the given character is the escape character.
  158.      */
  159.     boolean isEscape(final int ch) {
  160.         return ch == escape;
  161.     }

  162.     /**
  163.      * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
  164.      *
  165.      * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
  166.      *
  167.      * @return true if the next characters constitute an escape delimiter.
  168.      * @throws IOException If an I/O error occurs.
  169.      */
  170.     boolean isEscapeDelimiter() throws IOException {
  171.         reader.peek(escapeDelimiterBuf);
  172.         if (escapeDelimiterBuf[0] != delimiter[0]) {
  173.             return false;
  174.         }
  175.         for (int i = 1; i < delimiter.length; i++) {
  176.             if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
  177.                 return false;
  178.             }
  179.         }
  180.         final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
  181.         return count != EOF;
  182.     }

  183.     private boolean isMetaChar(final int ch) {
  184.         return ch == escape || ch == quoteChar || ch == commentStart;
  185.     }

  186.     boolean isQuoteChar(final int ch) {
  187.         return ch == quoteChar;
  188.     }

  189.     /**
  190.      * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
  191.      *
  192.      * @param ch the character to check
  193.      * @return true if the character is at the start of a line.
  194.      */
  195.     boolean isStartOfLine(final int ch) {
  196.         return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
  197.     }

  198.     /**
  199.      * Returns the next token.
  200.      * <p>
  201.      * A token corresponds to a term, a record change or an end-of-file indicator.
  202.      * </p>
  203.      *
  204.      * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
  205.      * @return the next token found.
  206.      * @throws IOException  on stream access error.
  207.      * @throws CSVException Thrown on invalid input.
  208.      */
  209.     Token nextToken(final Token token) throws IOException {
  210.         // Get the last read char (required for empty line detection)
  211.         int lastChar = reader.getLastChar();
  212.         // read the next char and set eol
  213.         int c = reader.read();
  214.         // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
  215.         boolean eol = readEndOfLine(c);
  216.         // empty line detection: eol AND (last char was EOL or beginning)
  217.         if (ignoreEmptyLines) {
  218.             while (eol && isStartOfLine(lastChar)) {
  219.                 // Go on char ahead ...
  220.                 lastChar = c;
  221.                 c = reader.read();
  222.                 eol = readEndOfLine(c);
  223.                 // reached the end of the file without any content (empty line at the end)
  224.                 if (isEndOfFile(c)) {
  225.                     token.type = Token.Type.EOF;
  226.                     // don't set token.isReady here because no content
  227.                     return token;
  228.                 }
  229.             }
  230.         }
  231.         // Did we reach EOF during the last iteration already? EOF
  232.         if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
  233.             token.type = Token.Type.EOF;
  234.             // don't set token.isReady here because no content
  235.             return token;
  236.         }
  237.         if (isStartOfLine(lastChar) && isCommentStart(c)) {
  238.             final String line = reader.readLine();
  239.             if (line == null) {
  240.                 token.type = Token.Type.EOF;
  241.                 // don't set token.isReady here because no content
  242.                 return token;
  243.             }
  244.             final String comment = line.trim();
  245.             token.content.append(comment);
  246.             token.type = Token.Type.COMMENT;
  247.             return token;
  248.         }
  249.         // Important: make sure a new char gets consumed in each iteration
  250.         while (token.type == Token.Type.INVALID) {
  251.             // ignore whitespaces at beginning of a token
  252.             if (ignoreSurroundingSpaces) {
  253.                 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
  254.                     c = reader.read();
  255.                     eol = readEndOfLine(c);
  256.                 }
  257.             }
  258.             // ok, start of token reached: encapsulated, or token
  259.             if (isDelimiter(c)) {
  260.                 // empty token return TOKEN("")
  261.                 token.type = Token.Type.TOKEN;
  262.             } else if (eol) {
  263.                 // empty token return EORECORD("")
  264.                 // noop: token.content.append("");
  265.                 token.type = Token.Type.EORECORD;
  266.             } else if (isQuoteChar(c)) {
  267.                 // consume encapsulated token
  268.                 parseEncapsulatedToken(token);
  269.             } else if (isEndOfFile(c)) {
  270.                 // end of file return EOF()
  271.                 // noop: token.content.append("");
  272.                 token.type = Token.Type.EOF;
  273.                 token.isReady = true; // there is data at EOF
  274.             } else {
  275.                 // next token must be a simple token
  276.                 // add removed blanks when not ignoring whitespace chars...
  277.                 parseSimpleToken(token, c);
  278.             }
  279.         }
  280.         return token;
  281.     }

  282.     private int nullToDisabled(final Character c) {
  283.         return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
  284.     }

  285.     /**
  286.      * Parses an encapsulated token.
  287.      * <p>
  288.      * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
  289.      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
  290.      * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
  291.      * </p>
  292.      * <ul>
  293.      * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
  294.      * <ul>
  295.      * <li>delimiter (TOKEN)</li>
  296.      * <li>end of line (EORECORD)</li>
  297.      * </ul>
  298.      * <li>end of stream has been reached (EOF)</li> </ul>
  299.      *
  300.      * @param token
  301.      *            the current token
  302.      * @return a valid token object
  303.      * @throws IOException
  304.      *             Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
  305.      *             delimiter or EOL.
  306.      * @throws CSVException Thrown on invalid input.
  307.      */
  308.     private Token parseEncapsulatedToken(final Token token) throws IOException {
  309.         token.isQuoted = true;
  310.         // Save current line number in case needed for IOE
  311.         final long startLineNumber = getCurrentLineNumber();
  312.         int c;
  313.         while (true) {
  314.             c = reader.read();

  315.             if (isQuoteChar(c)) {
  316.                 if (isQuoteChar(reader.peek())) {
  317.                     // double or escaped encapsulator -> add single encapsulator to token
  318.                     c = reader.read();
  319.                     token.content.append((char) c);
  320.                 } else {
  321.                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
  322.                     while (true) {
  323.                         c = reader.read();
  324.                         if (isDelimiter(c)) {
  325.                             token.type = Token.Type.TOKEN;
  326.                             return token;
  327.                         }
  328.                         if (isEndOfFile(c)) {
  329.                             token.type = Token.Type.EOF;
  330.                             token.isReady = true; // There is data at EOF
  331.                             return token;
  332.                         }
  333.                         if (readEndOfLine(c)) {
  334.                             token.type = Token.Type.EORECORD;
  335.                             return token;
  336.                         }
  337.                         if (trailingData) {
  338.                             token.content.append((char) c);
  339.                         } else if (!Character.isWhitespace((char) c)) {
  340.                             // error invalid char between token and next delimiter
  341.                             throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
  342.                                     getCurrentLineNumber(), getCharacterPosition());
  343.                         }
  344.                     }
  345.                 }
  346.             } else if (isEscape(c)) {
  347.                 appendNextEscapedCharacterToToken(token);
  348.             } else if (isEndOfFile(c)) {
  349.                 if (lenientEof) {
  350.                     token.type = Token.Type.EOF;
  351.                     token.isReady = true; // There is data at EOF
  352.                     return token;
  353.                 }
  354.                 // error condition (end of file before end of token)
  355.                 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
  356.             } else {
  357.                 // consume character
  358.                 token.content.append((char) c);
  359.             }
  360.         }
  361.     }

  362.     /**
  363.      * Parses a simple token.
  364.      * <p>
  365.      * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped delimiters (as \, or \;). The token is finished
  366.      * when one of the following conditions becomes true:
  367.      * </p>
  368.      * <ul>
  369.      * <li>The end of line has been reached (EORECORD)</li>
  370.      * <li>The end of stream has been reached (EOF)</li>
  371.      * <li>An unescaped delimiter has been reached (TOKEN)</li>
  372.      * </ul>
  373.      *
  374.      * @param token the current token
  375.      * @param ch     the current character
  376.      * @return the filled token
  377.      * @throws IOException  on stream access error
  378.      * @throws CSVException Thrown on invalid input.
  379.      */
  380.     private Token parseSimpleToken(final Token token, final int ch) throws IOException {
  381.         // Faster to use while(true)+break than while(token.type == INVALID)
  382.         int cur = ch;
  383.         while (true) {
  384.             if (readEndOfLine(cur)) {
  385.                 token.type = Token.Type.EORECORD;
  386.                 break;
  387.             }
  388.             if (isEndOfFile(cur)) {
  389.                 token.type = Token.Type.EOF;
  390.                 token.isReady = true; // There is data at EOF
  391.                 break;
  392.             }
  393.             if (isDelimiter(cur)) {
  394.                 token.type = Token.Type.TOKEN;
  395.                 break;
  396.             }
  397.             // continue
  398.             if (isEscape(cur)) {
  399.                 appendNextEscapedCharacterToToken(token);
  400.             } else {
  401.                 token.content.append((char) cur);
  402.             }
  403.             cur = reader.read(); // continue
  404.         }

  405.         if (ignoreSurroundingSpaces) {
  406.             trimTrailingSpaces(token.content);
  407.         }

  408.         return token;
  409.     }

  410.     /**
  411.      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
  412.      *
  413.      * @return true if the given or next character is a line-terminator
  414.      */
  415.     boolean readEndOfLine(final int ch) throws IOException {
  416.         // check if we have \r\n...
  417.         int cur = ch;
  418.         if (cur == Constants.CR && reader.peek() == Constants.LF) {
  419.             // note: does not change ch outside of this method!
  420.             cur = reader.read();
  421.             // Save the EOL state
  422.             if (firstEol == null) {
  423.                 this.firstEol = Constants.CRLF;
  424.             }
  425.         }
  426.         // save EOL state here.
  427.         if (firstEol == null) {
  428.             if (cur == Constants.LF) {
  429.                 this.firstEol = LF_STRING;
  430.             } else if (cur == Constants.CR) {
  431.                 this.firstEol = CR_STRING;
  432.             }
  433.         }

  434.         return cur == Constants.LF || cur == Constants.CR;
  435.     }

  436.     // TODO escape handling needs more work
  437.     /**
  438.      * Handle an escape sequence. The current character must be the escape character. On return, the next character is available by calling
  439.      * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
  440.      *
  441.      * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
  442.      * @throws IOException  if there is a problem reading the stream or the end of stream is detected: the escape character is not allowed at end of stream
  443.      * @throws CSVException Thrown on invalid input.
  444.      */
  445.     int readEscape() throws IOException {
  446.         // the escape char has just been read (normally a backslash)
  447.         final int ch = reader.read();
  448.         switch (ch) {
  449.         case 'r':
  450.             return Constants.CR;
  451.         case 'n':
  452.             return Constants.LF;
  453.         case 't':
  454.             return Constants.TAB;
  455.         case 'b':
  456.             return Constants.BACKSPACE;
  457.         case 'f':
  458.             return Constants.FF;
  459.         case Constants.CR:
  460.         case Constants.LF:
  461.         case Constants.FF: // TODO is this correct?
  462.         case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
  463.         case Constants.BACKSPACE: // TODO is this correct?
  464.             return ch;
  465.         case EOF:
  466.             throw new CSVException("EOF while processing escape sequence");
  467.         default:
  468.             // Now check for meta-characters
  469.             if (isMetaChar(ch)) {
  470.                 return ch;
  471.             }
  472.             // indicate unexpected char - available from in.getLastChar()
  473.             return EOF;
  474.         }
  475.     }

  476.     void trimTrailingSpaces(final StringBuilder buffer) {
  477.         int length = buffer.length();
  478.         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
  479.             length--;
  480.         }
  481.         if (length != buffer.length()) {
  482.             buffer.setLength(length);
  483.         }
  484.     }
  485. }