View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.END_OF_STREAM;
23  import static org.apache.commons.csv.Constants.FF;
24  import static org.apache.commons.csv.Constants.LF;
25  import static org.apache.commons.csv.Constants.TAB;
26  import static org.apache.commons.csv.Constants.UNDEFINED;
27  import static org.apache.commons.csv.Token.Type.COMMENT;
28  import static org.apache.commons.csv.Token.Type.EOF;
29  import static org.apache.commons.csv.Token.Type.EORECORD;
30  import static org.apache.commons.csv.Token.Type.INVALID;
31  import static org.apache.commons.csv.Token.Type.TOKEN;
32  
33  import java.io.Closeable;
34  import java.io.IOException;
35  
36  /**
37   * Lexical analyzer.
38   */
39  final class Lexer implements Closeable {
40  
    /** String form of the CR character; stored in {@link #firstEol} when the first line ending seen is a lone CR. */
    private static final String CR_STRING = Character.toString(CR);
    /** String form of the LF character; stored in {@link #firstEol} when the first line ending seen is a lone LF. */
    private static final String LF_STRING = Character.toString(LF);

    /**
     * Constant char to use for disabling comments, escapes and encapsulation. The value U+FFFE (-2 when viewed as a
     * signed 16-bit value) is used because, once widened to an int, it cannot be confused with an EOF signal (-1),
     * and because U+FFFE is a Unicode noncharacter that is not expected in real text, so there should never be a
     * collision with a real text char.
     */
    private static final char DISABLED = '\ufffe';

    /** The characters of the format's delimiter string. */
    private final char[] delimiter;
    /** Scratch buffer ({@code delimiter.length - 1} chars) for the look-ahead done by {@link #isDelimiter(int)}. */
    private final char[] delimiterBuf;
    /** Scratch buffer ({@code 2 * delimiter.length - 1} chars) for the look-ahead done by {@link #isEscapeDelimiter()}. */
    private final char[] escapeDelimiterBuf;
    /** The escape character, or {@link #DISABLED} when the format defines none. */
    private final char escape;
    /** The quote (encapsulator) character, or {@link #DISABLED} when the format defines none. */
    private final char quoteChar;
    /** The comment marker, or {@link #DISABLED} when the format defines none. */
    private final char commentStart;

    /** Whether whitespace around tokens is skipped/trimmed, as configured by the format. */
    private final boolean ignoreSurroundingSpaces;
    /** Whether empty lines are skipped by {@link #nextToken(Token)}, as configured by the format. */
    private final boolean ignoreEmptyLines;

    /** The input stream */
    private final ExtendedBufferedReader reader;
    /** The first line ending recorded by {@link #readEndOfLine(int)}; {@code null} until one has been seen. */
    private String firstEol;

    /** Set by {@link #isDelimiter(int)}: true when its most recent call matched (and consumed) a delimiter. */
    private boolean isLastTokenDelimiter;
66  
67      Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
68          this.reader = reader;
69          this.delimiter = format.getDelimiterString().toCharArray();
70          this.escape = mapNullToDisabled(format.getEscapeCharacter());
71          this.quoteChar = mapNullToDisabled(format.getQuoteCharacter());
72          this.commentStart = mapNullToDisabled(format.getCommentMarker());
73          this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
74          this.ignoreEmptyLines = format.getIgnoreEmptyLines();
75          this.delimiterBuf = new char[delimiter.length - 1];
76          this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
77      }
78  
79      /**
80       * Closes resources.
81       *
82       * @throws IOException
83       *             If an I/O error occurs
84       */
85      @Override
86      public void close() throws IOException {
87          reader.close();
88      }
89  
90      /**
91       * Returns the current character position
92       *
93       * @return the current character position
94       */
95      long getCharacterPosition() {
96          return reader.getPosition();
97      }
98  
99      /**
100      * Returns the current line number
101      *
102      * @return the current line number
103      */
104     long getCurrentLineNumber() {
105         return reader.getCurrentLineNumber();
106     }
107 
108     String getFirstEol(){
109         return firstEol;
110     }
111 
112     boolean isClosed() {
113         return reader.isClosed();
114     }
115 
116     boolean isCommentStart(final int ch) {
117         return ch == commentStart;
118     }
119 
120     /**
121      * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
122      *
123      * @param ch
124      *             the current character.
125      * @return true if the next characters constitute a delimiter.
126      * @throws IOException If an I/O error occurs.
127      */
128     boolean isDelimiter(final int ch) throws IOException {
129         isLastTokenDelimiter = false;
130         if (ch != delimiter[0]) {
131             return false;
132         }
133         if (delimiter.length == 1) {
134             isLastTokenDelimiter = true;
135             return true;
136         }
137         reader.lookAhead(delimiterBuf);
138         for (int i = 0; i < delimiterBuf.length; i++) {
139             if (delimiterBuf[i] != delimiter[i+1]) {
140                 return false;
141             }
142         }
143         final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
144         isLastTokenDelimiter = count != END_OF_STREAM;
145         return isLastTokenDelimiter;
146     }
147 
148     /**
149      * Tests if the given character indicates end of file.
150      *
151      * @return true if the given character indicates end of file.
152      */
153     boolean isEndOfFile(final int ch) {
154         return ch == END_OF_STREAM;
155     }
156 
157     /**
158      * Tests if the given character is the escape character.
159      *
160      * @return true if the given character is the escape character.
161      */
162     boolean isEscape(final int ch) {
163         return ch == escape;
164     }
165 
166     /**
167      * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
168      *
169      * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
170      *
171      * @return true if the next characters constitute a escape delimiter.
172      * @throws IOException If an I/O error occurs.
173      */
174     boolean isEscapeDelimiter() throws IOException {
175         reader.lookAhead(escapeDelimiterBuf);
176         if (escapeDelimiterBuf[0] != delimiter[0]) {
177             return false;
178         }
179         for (int i = 1; i < delimiter.length; i++) {
180             if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
181                 return false;
182             }
183         }
184         final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
185         return count != END_OF_STREAM;
186     }
187 
188     private boolean isMetaChar(final int ch) {
189         return ch == escape || ch == quoteChar || ch == commentStart;
190     }
191 
192     boolean isQuoteChar(final int ch) {
193         return ch == quoteChar;
194     }
195 
196     /**
197      * Tests if the current character represents the start of a line: a CR, LF or is at the start of the file.
198      *
199      * @param ch the character to check
200      * @return true if the character is at the start of a line.
201      */
202     boolean isStartOfLine(final int ch) {
203         return ch == LF || ch == CR || ch == UNDEFINED;
204     }
205 
206     private char mapNullToDisabled(final Character c) {
207         return c == null ? DISABLED : c.charValue();
208     }
209 
    /**
     * Returns the next token.
     * <p>
     * A token corresponds to a term, a record change, a comment line or an end-of-file indicator. The method
     * reads one character, optionally skips empty lines, handles end of file and comment lines, and otherwise
     * loops until one of the branches below has assigned a type other than {@code INVALID} to the token.
     * </p>
     *
     * @param token
     *            an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found (the same object that was passed in).
     * @throws IOException on stream access error.
     */
    Token nextToken(final Token token) throws IOException {

        // get the last read char (required for empty line detection)
        int lastChar = reader.getLastChar();

        // read the next char and set eol
        int c = reader.read();
        /*
         * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF
         * - they are equivalent here.
         */
        boolean eol = readEndOfLine(c);

        // empty line detection: eol AND (last char was EOL or beginning)
        if (ignoreEmptyLines) {
            while (eol && isStartOfLine(lastChar)) {
                // go one char ahead ...
                lastChar = c;
                c = reader.read();
                eol = readEndOfLine(c);
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    token.type = EOF;
                    // don't set token.isReady here because no content
                    return token;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        // If the previous token ended with a delimiter and we are at EOF now, do NOT return here: fall through so
        // the loop below reports EOF with token.isReady set (there is still a trailing, empty value to deliver).
        if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
            token.type = EOF;
            // don't set token.isReady here because no content
            return token;
        }

        // a comment marker only counts at the start of a line; the rest of the line becomes the comment text
        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            final String line = reader.readLine();
            if (line == null) {
                token.type = EOF;
                // don't set token.isReady here because no content
                return token;
            }
            final String comment = line.trim();
            token.content.append(comment);
            token.type = COMMENT;
            return token;
        }

        // important: make sure a new char gets consumed in each iteration
        while (token.type == INVALID) {
            // ignore whitespaces at beginning of a token
            if (ignoreSurroundingSpaces) {
                // note: isDelimiter(c) is not a pure test - it updates isLastTokenDelimiter and, for multi-char
                // delimiters, consumes the rest of a matched delimiter
                while (Character.isWhitespace((char)c) && !isDelimiter(c) && !eol) {
                    c = reader.read();
                    eol = readEndOfLine(c);
                }
            }

            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token return TOKEN("")
                token.type = TOKEN;
            } else if (eol) {
                // empty token return EORECORD("")
                // noop: token.content.append("");
                token.type = EORECORD;
            } else if (isQuoteChar(c)) {
                // consume encapsulated token
                parseEncapsulatedToken(token);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                // noop: token.content.append("");
                token.type = EOF;
                token.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                parseSimpleToken(token, c);
            }
        }
        return token;
    }
304 
305     /**
306      * Parses an encapsulated token.
307      * <p>
308      * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included
309      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
310      * an encapsulated token are ignored. The token is finished when one of the following conditions become true:
311      * </p>
312      * <ul>
313      * <li>an unescaped encapsulator has been reached, and is followed by optional whitespace then:</li>
314      * <ul>
315      * <li>delimiter (TOKEN)</li>
316      * <li>end of line (EORECORD)</li>
317      * </ul>
318      * <li>end of stream has been reached (EOF)</li> </ul>
319      *
320      * @param token
321      *            the current token
322      * @return a valid token object
323      * @throws IOException
324      *             on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL
325      */
326     private Token parseEncapsulatedToken(final Token token) throws IOException {
327         token.isQuoted = true;
328         // save current line number in case needed for IOE
329         final long startLineNumber = getCurrentLineNumber();
330         int c;
331         while (true) {
332             c = reader.read();
333 
334             if (isEscape(c)) {
335                 if (isEscapeDelimiter()) {
336                     token.content.append(delimiter);
337                 } else {
338                     final int unescaped = readEscape();
339                     if (unescaped == END_OF_STREAM) { // unexpected char after escape
340                         token.content.append((char) c).append((char) reader.getLastChar());
341                     } else {
342                         token.content.append((char) unescaped);
343                     }
344                 }
345             } else if (isQuoteChar(c)) {
346                 if (isQuoteChar(reader.lookAhead())) {
347                     // double or escaped encapsulator -> add single encapsulator to token
348                     c = reader.read();
349                     token.content.append((char) c);
350                 } else {
351                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
352                     while (true) {
353                         c = reader.read();
354                         if (isDelimiter(c)) {
355                             token.type = TOKEN;
356                             return token;
357                         }
358                         if (isEndOfFile(c)) {
359                             token.type = EOF;
360                             token.isReady = true; // There is data at EOF
361                             return token;
362                         }
363                         if (readEndOfLine(c)) {
364                             token.type = EORECORD;
365                             return token;
366                         }
367                         if (!Character.isWhitespace((char)c)) {
368                             // error invalid char between token and next delimiter
369                             throw new IOException("(line " + getCurrentLineNumber() +
370                                     ") invalid char between encapsulated token and delimiter");
371                         }
372                     }
373                 }
374             } else if (isEndOfFile(c)) {
375                 // error condition (end of file before end of token)
376                 throw new IOException("(startline " + startLineNumber +
377                         ") EOF reached before encapsulated token finished");
378             } else {
379                 // consume character
380                 token.content.append((char) c);
381             }
382         }
383     }
384 
385     /**
386      * Parses a simple token.
387      * <p>
388      * Simple token are tokens which are not surrounded by encapsulators. A simple token might contain escaped
389      * delimiters (as \, or \;). The token is finished when one of the following conditions become true:
390      * </p>
391      * <ul>
392      * <li>end of line has been reached (EORECORD)</li>
393      * <li>end of stream has been reached (EOF)</li>
394      * <li>an unescaped delimiter has been reached (TOKEN)</li>
395      * </ul>
396      *
397      * @param token
398      *            the current token
399      * @param ch
400      *            the current character
401      * @return the filled token
402      * @throws IOException
403      *             on stream access error
404      */
405     private Token parseSimpleToken(final Token token, int ch) throws IOException {
406         // Faster to use while(true)+break than while(token.type == INVALID)
407         while (true) {
408             if (readEndOfLine(ch)) {
409                 token.type = EORECORD;
410                 break;
411             }
412             if (isEndOfFile(ch)) {
413                 token.type = EOF;
414                 token.isReady = true; // There is data at EOF
415                 break;
416             }
417             if (isDelimiter(ch)) {
418                 token.type = TOKEN;
419                 break;
420             }
421             // continue
422             if (isEscape(ch)) {
423                 if (isEscapeDelimiter()) {
424                     token.content.append(delimiter);
425                 } else {
426                     final int unescaped = readEscape();
427                     if (unescaped == END_OF_STREAM) { // unexpected char after escape
428                         token.content.append((char) ch).append((char) reader.getLastChar());
429                     } else {
430                         token.content.append((char) unescaped);
431                     }
432                 }
433             } else {
434                 token.content.append((char) ch);
435             }
436             ch = reader.read(); // continue
437         }
438 
439         if (ignoreSurroundingSpaces) {
440             trimTrailingSpaces(token.content);
441         }
442 
443         return token;
444     }
445 
446     /**
447      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
448      *
449      * @return true if the given or next character is a line-terminator
450      */
451     boolean readEndOfLine(int ch) throws IOException {
452         // check if we have \r\n...
453         if (ch == CR && reader.lookAhead() == LF) {
454             // note: does not change ch outside of this method!
455             ch = reader.read();
456             // Save the EOL state
457             if (firstEol == null) {
458                 this.firstEol = Constants.CRLF;
459             }
460         }
461         // save EOL state here.
462         if (firstEol == null) {
463             if (ch == LF) {
464                 this.firstEol = LF_STRING;
465             } else if (ch == CR) {
466                 this.firstEol = CR_STRING;
467             }
468         }
469 
470         return ch == LF || ch == CR;
471     }
472 
473     // TODO escape handling needs more work
474     /**
475      * Handle an escape sequence.
476      * The current character must be the escape character.
477      * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
478      * on the input stream.
479      *
480      * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is
481      *      invalid.
482      * @throws IOException if there is a problem reading the stream or the end of stream is detected:
483      *      the escape character is not allowed at end of stream
484      */
485     int readEscape() throws IOException {
486         // the escape char has just been read (normally a backslash)
487         final int ch = reader.read();
488         switch (ch) {
489         case 'r':
490             return CR;
491         case 'n':
492             return LF;
493         case 't':
494             return TAB;
495         case 'b':
496             return BACKSPACE;
497         case 'f':
498             return FF;
499         case CR:
500         case LF:
501         case FF: // TODO is this correct?
502         case TAB: // TODO is this correct? Do tabs need to be escaped?
503         case BACKSPACE: // TODO is this correct?
504             return ch;
505         case END_OF_STREAM:
506             throw new IOException("EOF whilst processing escape sequence");
507         default:
508             // Now check for meta-characters
509             if (isMetaChar(ch)) {
510                 return ch;
511             }
512             // indicate unexpected char - available from in.getLastChar()
513             return END_OF_STREAM;
514         }
515     }
516 
517     void trimTrailingSpaces(final StringBuilder buffer) {
518         int length = buffer.length();
519         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
520             length = length - 1;
521         }
522         if (length != buffer.length()) {
523             buffer.setLength(length);
524         }
525     }
526 }