View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.FF;
23  import static org.apache.commons.csv.Constants.LF;
24  import static org.apache.commons.csv.Constants.TAB;
25  import static org.apache.commons.csv.Constants.UNDEFINED;
26  import static org.apache.commons.csv.Token.Type.COMMENT;
27  import static org.apache.commons.csv.Token.Type.EORECORD;
28  import static org.apache.commons.csv.Token.Type.INVALID;
29  import static org.apache.commons.csv.Token.Type.TOKEN;
30  import static org.apache.commons.io.IOUtils.EOF;
31  
32  import java.io.Closeable;
33  import java.io.IOException;
34  
35  /**
36   * Lexical analyzer.
37   */
38  final class Lexer implements Closeable {
39  
40      private static final String CR_STRING = Character.toString(CR);
41      private static final String LF_STRING = Character.toString(LF);
42  
43      /**
44       * Constant char to use for disabling comments, escapes, and encapsulation. The value -2 is used because it
45       * won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} would be encoded as two
46       * chars (using surrogates) and thus there should never be a collision with a real text char.
47       */
48      private static final char DISABLED = '\ufffe';
49  
50      private final char[] delimiter;
51      private final char[] delimiterBuf;
52      private final char[] escapeDelimiterBuf;
53      private final char escape;
54      private final char quoteChar;
55      private final char commentStart;
56      private final boolean ignoreSurroundingSpaces;
57      private final boolean ignoreEmptyLines;
58      private final boolean lenientEof;
59      private final boolean trailingData;
60  
61      /** The input stream */
62      private final ExtendedBufferedReader reader;
63      private String firstEol;
64  
65      private boolean isLastTokenDelimiter;
66  
67      Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
68          this.reader = reader;
69          this.delimiter = format.getDelimiterCharArray();
70          this.escape = mapNullToDisabled(format.getEscapeCharacter());
71          this.quoteChar = mapNullToDisabled(format.getQuoteCharacter());
72          this.commentStart = mapNullToDisabled(format.getCommentMarker());
73          this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
74          this.ignoreEmptyLines = format.getIgnoreEmptyLines();
75          this.lenientEof = format.getLenientEof();
76          this.trailingData = format.getTrailingData();
77          this.delimiterBuf = new char[delimiter.length - 1];
78          this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
79      }
80  
81      /**
82       * Closes resources.
83       *
84       * @throws IOException
85       *             If an I/O error occurs
86       */
87      @Override
88      public void close() throws IOException {
89          reader.close();
90      }
91  
92      /**
93       * Returns the current character position
94       *
95       * @return the current character position
96       */
97      long getCharacterPosition() {
98          return reader.getPosition();
99      }
100 
101     /**
102      * Returns the current line number
103      *
104      * @return the current line number
105      */
106     long getCurrentLineNumber() {
107         return reader.getCurrentLineNumber();
108     }
109 
110     String getFirstEol(){
111         return firstEol;
112     }
113 
114     boolean isClosed() {
115         return reader.isClosed();
116     }
117 
118     boolean isCommentStart(final int ch) {
119         return ch == commentStart;
120     }
121 
122     /**
123      * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
124      *
125      * @param ch
126      *             the current character.
127      * @return true if the next characters constitute a delimiter.
128      * @throws IOException If an I/O error occurs.
129      */
130     boolean isDelimiter(final int ch) throws IOException {
131         isLastTokenDelimiter = false;
132         if (ch != delimiter[0]) {
133             return false;
134         }
135         if (delimiter.length == 1) {
136             isLastTokenDelimiter = true;
137             return true;
138         }
139         reader.lookAhead(delimiterBuf);
140         for (int i = 0; i < delimiterBuf.length; i++) {
141             if (delimiterBuf[i] != delimiter[i+1]) {
142                 return false;
143             }
144         }
145         final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
146         isLastTokenDelimiter = count != EOF;
147         return isLastTokenDelimiter;
148     }
149 
150     /**
151      * Tests if the given character indicates the end of the file.
152      *
153      * @return true if the given character indicates the end of the file.
154      */
155     boolean isEndOfFile(final int ch) {
156         return ch == EOF;
157     }
158 
159     /**
160      * Tests if the given character is the escape character.
161      *
162      * @return true if the given character is the escape character.
163      */
164     boolean isEscape(final int ch) {
165         return ch == escape;
166     }
167 
168     /**
169      * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
170      *
171      * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
172      *
173      * @return true if the next characters constitute an escape delimiter.
174      * @throws IOException If an I/O error occurs.
175      */
176     boolean isEscapeDelimiter() throws IOException {
177         reader.lookAhead(escapeDelimiterBuf);
178         if (escapeDelimiterBuf[0] != delimiter[0]) {
179             return false;
180         }
181         for (int i = 1; i < delimiter.length; i++) {
182             if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
183                 return false;
184             }
185         }
186         final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
187         return count != EOF;
188     }
189 
190     private boolean isMetaChar(final int ch) {
191         return ch == escape || ch == quoteChar || ch == commentStart;
192     }
193 
194     boolean isQuoteChar(final int ch) {
195         return ch == quoteChar;
196     }
197 
198     /**
199      * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
200      *
201      * @param ch the character to check
202      * @return true if the character is at the start of a line.
203      */
204     boolean isStartOfLine(final int ch) {
205         return ch == LF || ch == CR || ch == UNDEFINED;
206     }
207 
208     private char mapNullToDisabled(final Character c) {
209         return c == null ? DISABLED : c.charValue();
210     }
211 
212     /**
213      * Returns the next token.
214      * <p>
215      * A token corresponds to a term, a record change or an end-of-file indicator.
216      * </p>
217      *
218      * @param token
219      *            an existing Token object to reuse. The caller is responsible for initializing the Token.
220      * @return the next token found.
221      * @throws IOException on stream access error.
222      */
223     Token nextToken(final Token token) throws IOException {
224 
225         // Get the last read char (required for empty line detection)
226         int lastChar = reader.getLastChar();
227 
228         // read the next char and set eol
229         int c = reader.read();
230         /*
231          * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF
232          * - they are equivalent here.
233          */
234         boolean eol = readEndOfLine(c);
235 
236         // empty line detection: eol AND (last char was EOL or beginning)
237         if (ignoreEmptyLines) {
238             while (eol && isStartOfLine(lastChar)) {
239                 // Go on char ahead ...
240                 lastChar = c;
241                 c = reader.read();
242                 eol = readEndOfLine(c);
243                 // reached the end of the file without any content (empty line at the end)
244                 if (isEndOfFile(c)) {
245                     token.type = Token.Type.EOF;
246                     // don't set token.isReady here because no content
247                     return token;
248                 }
249             }
250         }
251 
252         // Did we reach EOF during the last iteration already? EOF
253         if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
254             token.type = Token.Type.EOF;
255             // don't set token.isReady here because no content
256             return token;
257         }
258 
259         if (isStartOfLine(lastChar) && isCommentStart(c)) {
260             final String line = reader.readLine();
261             if (line == null) {
262                 token.type = Token.Type.EOF;
263                 // don't set token.isReady here because no content
264                 return token;
265             }
266             final String comment = line.trim();
267             token.content.append(comment);
268             token.type = COMMENT;
269             return token;
270         }
271 
272         // Important: make sure a new char gets consumed in each iteration
273         while (token.type == INVALID) {
274             // ignore whitespaces at beginning of a token
275             if (ignoreSurroundingSpaces) {
276                 while (Character.isWhitespace((char)c) && !isDelimiter(c) && !eol) {
277                     c = reader.read();
278                     eol = readEndOfLine(c);
279                 }
280             }
281 
282             // ok, start of token reached: encapsulated, or token
283             if (isDelimiter(c)) {
284                 // empty token return TOKEN("")
285                 token.type = TOKEN;
286             } else if (eol) {
287                 // empty token return EORECORD("")
288                 // noop: token.content.append("");
289                 token.type = EORECORD;
290             } else if (isQuoteChar(c)) {
291                 // consume encapsulated token
292                 parseEncapsulatedToken(token);
293             } else if (isEndOfFile(c)) {
294                 // end of file return EOF()
295                 // noop: token.content.append("");
296                 token.type = Token.Type.EOF;
297                 token.isReady = true; // there is data at EOF
298             } else {
299                 // next token must be a simple token
300                 // add removed blanks when not ignoring whitespace chars...
301                 parseSimpleToken(token, c);
302             }
303         }
304         return token;
305     }
306 
307     /**
308      * Parses an encapsulated token.
309      * <p>
310      * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
311      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
312      * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
313      * </p>
314      * <ul>
315      * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
316      * <ul>
317      * <li>delimiter (TOKEN)</li>
318      * <li>end of line (EORECORD)</li>
319      * </ul>
320      * <li>end of stream has been reached (EOF)</li> </ul>
321      *
322      * @param token
323      *            the current token
324      * @return a valid token object
325      * @throws IOException
326      *             Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
327      *             delimiter or EOL.
328      */
329     private Token parseEncapsulatedToken(final Token token) throws IOException {
330         token.isQuoted = true;
331         // Save current line number in case needed for IOE
332         final long startLineNumber = getCurrentLineNumber();
333         int c;
334         while (true) {
335             c = reader.read();
336 
337             if (isQuoteChar(c)) {
338                 if (isQuoteChar(reader.lookAhead())) {
339                     // double or escaped encapsulator -> add single encapsulator to token
340                     c = reader.read();
341                     token.content.append((char) c);
342                 } else {
343                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
344                     while (true) {
345                         c = reader.read();
346                         if (isDelimiter(c)) {
347                             token.type = TOKEN;
348                             return token;
349                         }
350                         if (isEndOfFile(c)) {
351                             token.type = Token.Type.EOF;
352                             token.isReady = true; // There is data at EOF
353                             return token;
354                         }
355                         if (readEndOfLine(c)) {
356                             token.type = EORECORD;
357                             return token;
358                         }
359                         if (trailingData) {
360                             token.content.append((char) c);
361                         } else if (!Character.isWhitespace((char) c)) {
362                             // error invalid char between token and next delimiter
363                             throw new IOException(String.format("Invalid char between encapsulated token and delimiter at line: %,d, position: %,d",
364                                     getCurrentLineNumber(), getCharacterPosition()));
365                         }
366                     }
367                 }
368             } else if (isEscape(c)) {
369                 if (isEscapeDelimiter()) {
370                     token.content.append(delimiter);
371                 } else {
372                     final int unescaped = readEscape();
373                     if (unescaped == EOF) { // unexpected char after escape
374                         token.content.append((char) c).append((char) reader.getLastChar());
375                     } else {
376                         token.content.append((char) unescaped);
377                     }
378                 }
379             } else if (isEndOfFile(c)) {
380                 if (lenientEof) {
381                     token.type = Token.Type.EOF;
382                     token.isReady = true; // There is data at EOF
383                     return token;
384                 }
385                 // error condition (end of file before end of token)
386                 throw new IOException("(startline " + startLineNumber +
387                         ") EOF reached before encapsulated token finished");
388             } else {
389                 // consume character
390                 token.content.append((char) c);
391             }
392         }
393     }
394 
395     /**
396      * Parses a simple token.
397      * <p>
398      * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped
399      * delimiters (as \, or \;). The token is finished when one of the following conditions becomes true:
400      * </p>
401      * <ul>
402      * <li>The end of line has been reached (EORECORD)</li>
403      * <li>The end of stream has been reached (EOF)</li>
404      * <li>An unescaped delimiter has been reached (TOKEN)</li>
405      * </ul>
406      *
407      * @param token
408      *            the current token
409      * @param ch
410      *            the current character
411      * @return the filled token
412      * @throws IOException
413      *             on stream access error
414      */
415     private Token parseSimpleToken(final Token token, int ch) throws IOException {
416         // Faster to use while(true)+break than while(token.type == INVALID)
417         while (true) {
418             if (readEndOfLine(ch)) {
419                 token.type = EORECORD;
420                 break;
421             }
422             if (isEndOfFile(ch)) {
423                 token.type = Token.Type.EOF;
424                 token.isReady = true; // There is data at EOF
425                 break;
426             }
427             if (isDelimiter(ch)) {
428                 token.type = TOKEN;
429                 break;
430             }
431             // continue
432             if (isEscape(ch)) {
433                 if (isEscapeDelimiter()) {
434                     token.content.append(delimiter);
435                 } else {
436                     final int unescaped = readEscape();
437                     if (unescaped == EOF) { // unexpected char after escape
438                         token.content.append((char) ch).append((char) reader.getLastChar());
439                     } else {
440                         token.content.append((char) unescaped);
441                     }
442                 }
443             } else {
444                 token.content.append((char) ch);
445             }
446             ch = reader.read(); // continue
447         }
448 
449         if (ignoreSurroundingSpaces) {
450             trimTrailingSpaces(token.content);
451         }
452 
453         return token;
454     }
455 
456     /**
457      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
458      *
459      * @return true if the given or next character is a line-terminator
460      */
461     boolean readEndOfLine(int ch) throws IOException {
462         // check if we have \r\n...
463         if (ch == CR && reader.lookAhead() == LF) {
464             // note: does not change ch outside of this method!
465             ch = reader.read();
466             // Save the EOL state
467             if (firstEol == null) {
468                 this.firstEol = Constants.CRLF;
469             }
470         }
471         // save EOL state here.
472         if (firstEol == null) {
473             if (ch == LF) {
474                 this.firstEol = LF_STRING;
475             } else if (ch == CR) {
476                 this.firstEol = CR_STRING;
477             }
478         }
479 
480         return ch == LF || ch == CR;
481     }
482 
483     // TODO escape handling needs more work
484     /**
485      * Handle an escape sequence.
486      * The current character must be the escape character.
487      * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
488      * on the input stream.
489      *
490      * @return the unescaped character (as an int) or {@link Constants#EOF} if char following the escape is
491      *      invalid.
492      * @throws IOException if there is a problem reading the stream or the end of stream is detected:
493      *      the escape character is not allowed at end of stream
494      */
495     int readEscape() throws IOException {
496         // the escape char has just been read (normally a backslash)
497         final int ch = reader.read();
498         switch (ch) {
499         case 'r':
500             return CR;
501         case 'n':
502             return LF;
503         case 't':
504             return TAB;
505         case 'b':
506             return BACKSPACE;
507         case 'f':
508             return FF;
509         case CR:
510         case LF:
511         case FF: // TODO is this correct?
512         case TAB: // TODO is this correct? Do tabs need to be escaped?
513         case BACKSPACE: // TODO is this correct?
514             return ch;
515         case EOF:
516             throw new IOException("EOF whilst processing escape sequence");
517         default:
518             // Now check for meta-characters
519             if (isMetaChar(ch)) {
520                 return ch;
521             }
522             // indicate unexpected char - available from in.getLastChar()
523             return EOF;
524         }
525     }
526 
527     void trimTrailingSpaces(final StringBuilder buffer) {
528         int length = buffer.length();
529         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
530             length--;
531         }
532         if (length != buffer.length()) {
533             buffer.setLength(length);
534         }
535     }
536 }