View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
import static org.apache.commons.io.IOUtils.EOF;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.io.IOUtils;
28  
29  /**
30   * Lexical analyzer.
31   */
32  final class Lexer implements Closeable {
33  
34      private static final String CR_STRING = Character.toString(Constants.CR);
35      private static final String LF_STRING = Character.toString(Constants.LF);
36  
37      private final char[] delimiter;
38      private final char[] delimiterBuf;
39      private final char[] escapeDelimiterBuf;
40      private final int escape;
41      private final int quoteChar;
42      private final int commentStart;
43      private final boolean ignoreSurroundingSpaces;
44      private final boolean ignoreEmptyLines;
45      private final boolean lenientEof;
46      private final boolean trailingData;
47  
48      /** The buffered reader. */
49      private final ExtendedBufferedReader reader;
50      private String firstEol;
51  
52      private boolean isLastTokenDelimiter;
53  
54      Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
55          this.reader = reader;
56          this.delimiter = format.getDelimiterCharArray();
57          this.escape = nullToDisabled(format.getEscapeCharacter());
58          this.quoteChar = nullToDisabled(format.getQuoteCharacter());
59          this.commentStart = nullToDisabled(format.getCommentMarker());
60          this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
61          this.ignoreEmptyLines = format.getIgnoreEmptyLines();
62          this.lenientEof = format.getLenientEof();
63          this.trailingData = format.getTrailingData();
64          this.delimiterBuf = new char[delimiter.length - 1];
65          this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
66      }
67  
68      /**
69       * Appends the next escaped character to the token's content.
70       *
71       * @param token the current token
72       * @throws IOException  on stream access error
73       * @throws CSVException Thrown on invalid input.
74       */
75      private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
76          if (isEscapeDelimiter()) {
77              token.content.append(delimiter);
78          } else {
79              final int unescaped = readEscape();
80              if (unescaped == EOF) { // unexpected char after escape
81                  token.content.append((char) escape).append((char) reader.getLastChar());
82              } else {
83                  token.content.append((char) unescaped);
84              }
85          }
86      }
87  
88      /**
89       * Closes resources.
90       *
91       * @throws IOException
92       *             If an I/O error occurs
93       */
94      @Override
95      public void close() throws IOException {
96          reader.close();
97      }
98  
99      /**
100      * Gets the number of bytes read
101      *
102      * @return the number of bytes read
103      */
104     long getBytesRead() {
105         return reader.getBytesRead();
106     }
107 
108     /**
109      * Returns the current character position
110      *
111      * @return the current character position
112      */
113     long getCharacterPosition() {
114         return reader.getPosition();
115     }
116 
117     /**
118      * Returns the current line number
119      *
120      * @return the current line number
121      */
122     long getCurrentLineNumber() {
123         return reader.getLineNumber();
124     }
125 
126     String getFirstEol() {
127         return firstEol;
128     }
129 
130     boolean isClosed() {
131         return reader.isClosed();
132     }
133 
134     boolean isCommentStart(final int ch) {
135         return ch == commentStart;
136     }
137 
138     /**
139      * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
140      *
141      * @param ch
142      *             the current character.
143      * @return true if the next characters constitute a delimiter.
144      * @throws IOException If an I/O error occurs.
145      */
146     boolean isDelimiter(final int ch) throws IOException {
147         isLastTokenDelimiter = false;
148         if (ch != delimiter[0]) {
149             return false;
150         }
151         if (delimiter.length == 1) {
152             isLastTokenDelimiter = true;
153             return true;
154         }
155         reader.peek(delimiterBuf);
156         for (int i = 0; i < delimiterBuf.length; i++) {
157             if (delimiterBuf[i] != delimiter[i + 1]) {
158                 return false;
159             }
160         }
161         final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
162         isLastTokenDelimiter = count != EOF;
163         return isLastTokenDelimiter;
164     }
165 
166     /**
167      * Tests if the given character indicates the end of the file.
168      *
169      * @return true if the given character indicates the end of the file.
170      */
171     boolean isEndOfFile(final int ch) {
172         return ch == EOF;
173     }
174 
175     /**
176      * Tests if the given character is the escape character.
177      *
178      * @return true if the given character is the escape character.
179      */
180     boolean isEscape(final int ch) {
181         return ch == escape;
182     }
183 
184     /**
185      * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
186      *
187      * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
188      *
189      * @return true if the next characters constitute an escape delimiter.
190      * @throws IOException If an I/O error occurs.
191      */
192     boolean isEscapeDelimiter() throws IOException {
193         reader.peek(escapeDelimiterBuf);
194         if (escapeDelimiterBuf[0] != delimiter[0]) {
195             return false;
196         }
197         for (int i = 1; i < delimiter.length; i++) {
198             if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
199                 return false;
200             }
201         }
202         final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
203         return count != EOF;
204     }
205 
206     private boolean isMetaChar(final int ch) {
207         return ch == escape || ch == quoteChar || ch == commentStart;
208     }
209 
210     boolean isQuoteChar(final int ch) {
211         return ch == quoteChar;
212     }
213 
214     /**
215      * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
216      *
217      * @param ch the character to check
218      * @return true if the character is at the start of a line.
219      */
220     boolean isStartOfLine(final int ch) {
221         return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
222     }
223 
224     /**
225      * Returns the next token.
226      * <p>
227      * A token corresponds to a term, a record change or an end-of-file indicator.
228      * </p>
229      *
230      * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
231      * @return the next token found.
232      * @throws IOException  on stream access error.
233      * @throws CSVException Thrown on invalid input.
234      */
235     Token nextToken(final Token token) throws IOException {
236         // Get the last read char (required for empty line detection)
237         int lastChar = reader.getLastChar();
238         // read the next char and set eol
239         int c = reader.read();
240         // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
241         boolean eol = readEndOfLine(c);
242         // empty line detection: eol AND (last char was EOL or beginning)
243         if (ignoreEmptyLines) {
244             while (eol && isStartOfLine(lastChar)) {
245                 // Go on char ahead ...
246                 lastChar = c;
247                 c = reader.read();
248                 eol = readEndOfLine(c);
249                 // reached the end of the file without any content (empty line at the end)
250                 if (isEndOfFile(c)) {
251                     token.type = Token.Type.EOF;
252                     // don't set token.isReady here because no content
253                     return token;
254                 }
255             }
256         }
257         // Did we reach EOF during the last iteration already? EOF
258         if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
259             token.type = Token.Type.EOF;
260             // don't set token.isReady here because no content
261             return token;
262         }
263         if (isStartOfLine(lastChar) && isCommentStart(c)) {
264             final String line = reader.readLine();
265             if (line == null) {
266                 token.type = Token.Type.EOF;
267                 // don't set token.isReady here because no content
268                 return token;
269             }
270             final String comment = line.trim();
271             token.content.append(comment);
272             token.type = Token.Type.COMMENT;
273             return token;
274         }
275         // Important: make sure a new char gets consumed in each iteration
276         while (token.type == Token.Type.INVALID) {
277             // ignore whitespaces at beginning of a token
278             if (ignoreSurroundingSpaces) {
279                 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
280                     c = reader.read();
281                     eol = readEndOfLine(c);
282                 }
283             }
284             // ok, start of token reached: encapsulated, or token
285             if (isDelimiter(c)) {
286                 // empty token return TOKEN("")
287                 token.type = Token.Type.TOKEN;
288             } else if (eol) {
289                 // empty token return EORECORD("")
290                 // noop: token.content.append("");
291                 token.type = Token.Type.EORECORD;
292             } else if (isQuoteChar(c)) {
293                 // consume encapsulated token
294                 parseEncapsulatedToken(token);
295             } else if (isEndOfFile(c)) {
296                 // end of file return EOF()
297                 // noop: token.content.append("");
298                 token.type = Token.Type.EOF;
299                 token.isReady = true; // there is data at EOF
300             } else {
301                 // next token must be a simple token
302                 // add removed blanks when not ignoring whitespace chars...
303                 parseSimpleToken(token, c);
304             }
305         }
306         return token;
307     }
308 
309     private int nullToDisabled(final Character c) {
310         return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
311     }
312 
313     /**
314      * Parses an encapsulated token.
315      * <p>
316      * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
317      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
318      * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
319      * </p>
320      * <ul>
321      * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
322      * <ul>
323      * <li>delimiter (TOKEN)</li>
324      * <li>end of line (EORECORD)</li>
325      * </ul>
326      * <li>end of stream has been reached (EOF)</li> </ul>
327      *
328      * @param token
329      *            the current token
330      * @return a valid token object
331      * @throws IOException
332      *             Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
333      *             delimiter or EOL.
334      * @throws CSVException Thrown on invalid input.
335      */
336     private Token parseEncapsulatedToken(final Token token) throws IOException {
337         token.isQuoted = true;
338         // Save current line number in case needed for IOE
339         final long startLineNumber = getCurrentLineNumber();
340         int c;
341         while (true) {
342             c = reader.read();
343 
344             if (isQuoteChar(c)) {
345                 if (isQuoteChar(reader.peek())) {
346                     // double or escaped encapsulator -> add single encapsulator to token
347                     c = reader.read();
348                     token.content.append((char) c);
349                 } else {
350                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
351                     while (true) {
352                         c = reader.read();
353                         if (isDelimiter(c)) {
354                             token.type = Token.Type.TOKEN;
355                             return token;
356                         }
357                         if (isEndOfFile(c)) {
358                             token.type = Token.Type.EOF;
359                             token.isReady = true; // There is data at EOF
360                             return token;
361                         }
362                         if (readEndOfLine(c)) {
363                             token.type = Token.Type.EORECORD;
364                             return token;
365                         }
366                         if (trailingData) {
367                             token.content.append((char) c);
368                         } else if (!Character.isWhitespace((char) c)) {
369                             // error invalid char between token and next delimiter
370                             throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
371                                     getCurrentLineNumber(), getCharacterPosition());
372                         }
373                     }
374                 }
375             } else if (isEscape(c)) {
376                 appendNextEscapedCharacterToToken(token);
377             } else if (isEndOfFile(c)) {
378                 if (lenientEof) {
379                     token.type = Token.Type.EOF;
380                     token.isReady = true; // There is data at EOF
381                     return token;
382                 }
383                 // error condition (end of file before end of token)
384                 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
385             } else {
386                 // consume character
387                 token.content.append((char) c);
388             }
389         }
390     }
391 
392     /**
393      * Parses a simple token.
394      * <p>
395      * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped delimiters (as \, or \;). The token is finished
396      * when one of the following conditions becomes true:
397      * </p>
398      * <ul>
399      * <li>The end of line has been reached (EORECORD)</li>
400      * <li>The end of stream has been reached (EOF)</li>
401      * <li>An unescaped delimiter has been reached (TOKEN)</li>
402      * </ul>
403      *
404      * @param token the current token
405      * @param ch     the current character
406      * @return the filled token
407      * @throws IOException  on stream access error
408      * @throws CSVException Thrown on invalid input.
409      */
410     private Token parseSimpleToken(final Token token, final int ch) throws IOException {
411         // Faster to use while(true)+break than while(token.type == INVALID)
412         int cur = ch;
413         while (true) {
414             if (readEndOfLine(cur)) {
415                 token.type = Token.Type.EORECORD;
416                 break;
417             }
418             if (isEndOfFile(cur)) {
419                 token.type = Token.Type.EOF;
420                 token.isReady = true; // There is data at EOF
421                 break;
422             }
423             if (isDelimiter(cur)) {
424                 token.type = Token.Type.TOKEN;
425                 break;
426             }
427             // continue
428             if (isEscape(cur)) {
429                 appendNextEscapedCharacterToToken(token);
430             } else {
431                 token.content.append((char) cur);
432             }
433             cur = reader.read(); // continue
434         }
435 
436         if (ignoreSurroundingSpaces) {
437             trimTrailingSpaces(token.content);
438         }
439 
440         return token;
441     }
442 
443     /**
444      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
445      *
446      * @return true if the given or next character is a line-terminator
447      */
448     boolean readEndOfLine(final int ch) throws IOException {
449         // check if we have \r\n...
450         int cur = ch;
451         if (cur == Constants.CR && reader.peek() == Constants.LF) {
452             // note: does not change ch outside of this method!
453             cur = reader.read();
454             // Save the EOL state
455             if (firstEol == null) {
456                 this.firstEol = Constants.CRLF;
457             }
458         }
459         // save EOL state here.
460         if (firstEol == null) {
461             if (cur == Constants.LF) {
462                 this.firstEol = LF_STRING;
463             } else if (cur == Constants.CR) {
464                 this.firstEol = CR_STRING;
465             }
466         }
467 
468         return cur == Constants.LF || cur == Constants.CR;
469     }
470 
471     // TODO escape handling needs more work
472     /**
473      * Handle an escape sequence. The current character must be the escape character. On return, the next character is available by calling
474      * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
475      *
476      * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
477      * @throws IOException  if there is a problem reading the stream or the end of stream is detected: the escape character is not allowed at end of stream
478      * @throws CSVException Thrown on invalid input.
479      */
480     int readEscape() throws IOException {
481         // the escape char has just been read (normally a backslash)
482         final int ch = reader.read();
483         switch (ch) {
484         case 'r':
485             return Constants.CR;
486         case 'n':
487             return Constants.LF;
488         case 't':
489             return Constants.TAB;
490         case 'b':
491             return Constants.BACKSPACE;
492         case 'f':
493             return Constants.FF;
494         case Constants.CR:
495         case Constants.LF:
496         case Constants.FF: // TODO is this correct?
497         case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
498         case Constants.BACKSPACE: // TODO is this correct?
499             return ch;
500         case EOF:
501             throw new CSVException("EOF while processing escape sequence");
502         default:
503             // Now check for meta-characters
504             if (isMetaChar(ch)) {
505                 return ch;
506             }
507             // indicate unexpected char - available from in.getLastChar()
508             return EOF;
509         }
510     }
511 
512     void trimTrailingSpaces(final StringBuilder buffer) {
513         int length = buffer.length();
514         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
515             length--;
516         }
517         if (length != buffer.length()) {
518             buffer.setLength(length);
519         }
520     }
521 }