/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.END_OF_STREAM;
23  import static org.apache.commons.csv.Constants.FF;
24  import static org.apache.commons.csv.Constants.LF;
25  import static org.apache.commons.csv.Constants.TAB;
26  import static org.apache.commons.csv.Constants.UNDEFINED;
27  import static org.apache.commons.csv.Token.Type.COMMENT;
28  import static org.apache.commons.csv.Token.Type.EOF;
29  import static org.apache.commons.csv.Token.Type.EORECORD;
30  import static org.apache.commons.csv.Token.Type.INVALID;
31  import static org.apache.commons.csv.Token.Type.TOKEN;
32  
33  import java.io.Closeable;
34  import java.io.IOException;
35  
36  /**
37   * Lexical analyzer.
38   */
39  final class Lexer implements Closeable {
40  
41      private static final String CR_STRING = Character.toString(Constants.CR);
42      private static final String LF_STRING = Character.toString(Constants.LF);
43  
44      /**
45       * Constant char to use for disabling comments, escapes and encapsulation. The value -2 is used because it
46       * won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} would be encoded as two
47       * chars (using surrogates) and thus there should never be a collision with a real text char.
48       */
49      private static final char DISABLED = '\ufffe';
50  
51      private final char delimiter;
52      private final char escape;
53      private final char quoteChar;
54      private final char commentStart;
55  
56      private final boolean ignoreSurroundingSpaces;
57      private final boolean ignoreEmptyLines;
58  
59      /** The input stream */
60      private final ExtendedBufferedReader reader;
61      private String firstEol;
62  
63      String getFirstEol(){
64          return firstEol;
65      }
66  
67      Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
68          this.reader = reader;
69          this.delimiter = format.getDelimiter();
70          this.escape = mapNullToDisabled(format.getEscapeCharacter());
71          this.quoteChar = mapNullToDisabled(format.getQuoteCharacter());
72          this.commentStart = mapNullToDisabled(format.getCommentMarker());
73          this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
74          this.ignoreEmptyLines = format.getIgnoreEmptyLines();
75      }
76  
77      /**
78       * Returns the next token.
79       * <p>
80       * A token corresponds to a term, a record change or an end-of-file indicator.
81       * </p>
82       *
83       * @param token
84       *            an existing Token object to reuse. The caller is responsible to initialize the Token.
85       * @return the next token found
86       * @throws java.io.IOException
87       *             on stream access error
88       */
89      Token nextToken(final Token token) throws IOException {
90  
91          // get the last read char (required for empty line detection)
92          int lastChar = reader.getLastChar();
93  
94          // read the next char and set eol
95          int c = reader.read();
96          /*
97           * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF
98           * - they are equivalent here.
99           */
100         boolean eol = readEndOfLine(c);
101 
102         // empty line detection: eol AND (last char was EOL or beginning)
103         if (ignoreEmptyLines) {
104             while (eol && isStartOfLine(lastChar)) {
105                 // go on char ahead ...
106                 lastChar = c;
107                 c = reader.read();
108                 eol = readEndOfLine(c);
109                 // reached end of file without any content (empty line at the end)
110                 if (isEndOfFile(c)) {
111                     token.type = EOF;
112                     // don't set token.isReady here because no content
113                     return token;
114                 }
115             }
116         }
117 
118         // did we reach eof during the last iteration already ? EOF
119         if (isEndOfFile(lastChar) || !isDelimiter(lastChar) && isEndOfFile(c)) {
120             token.type = EOF;
121             // don't set token.isReady here because no content
122             return token;
123         }
124 
125         if (isStartOfLine(lastChar) && isCommentStart(c)) {
126             final String line = reader.readLine();
127             if (line == null) {
128                 token.type = EOF;
129                 // don't set token.isReady here because no content
130                 return token;
131             }
132             final String comment = line.trim();
133             token.content.append(comment);
134             token.type = COMMENT;
135             return token;
136         }
137 
138         // important: make sure a new char gets consumed in each iteration
139         while (token.type == INVALID) {
140             // ignore whitespaces at beginning of a token
141             if (ignoreSurroundingSpaces) {
142                 while (isWhitespace(c) && !eol) {
143                     c = reader.read();
144                     eol = readEndOfLine(c);
145                 }
146             }
147 
148             // ok, start of token reached: encapsulated, or token
149             if (isDelimiter(c)) {
150                 // empty token return TOKEN("")
151                 token.type = TOKEN;
152             } else if (eol) {
153                 // empty token return EORECORD("")
154                 // noop: token.content.append("");
155                 token.type = EORECORD;
156             } else if (isQuoteChar(c)) {
157                 // consume encapsulated token
158                 parseEncapsulatedToken(token);
159             } else if (isEndOfFile(c)) {
160                 // end of file return EOF()
161                 // noop: token.content.append("");
162                 token.type = EOF;
163                 token.isReady = true; // there is data at EOF
164             } else {
165                 // next token must be a simple token
166                 // add removed blanks when not ignoring whitespace chars...
167                 parseSimpleToken(token, c);
168             }
169         }
170         return token;
171     }
172 
173     /**
174      * Parses a simple token.
175      * <p/>
176      * Simple token are tokens which are not surrounded by encapsulators. A simple token might contain escaped
177      * delimiters (as \, or \;). The token is finished when one of the following conditions become true:
178      * <ul>
179      * <li>end of line has been reached (EORECORD)</li>
180      * <li>end of stream has been reached (EOF)</li>
181      * <li>an unescaped delimiter has been reached (TOKEN)</li>
182      * </ul>
183      *
184      * @param token
185      *            the current token
186      * @param ch
187      *            the current character
188      * @return the filled token
189      * @throws IOException
190      *             on stream access error
191      */
192     private Token parseSimpleToken(final Token token, int ch) throws IOException {
193         // Faster to use while(true)+break than while(token.type == INVALID)
194         while (true) {
195             if (readEndOfLine(ch)) {
196                 token.type = EORECORD;
197                 break;
198             } else if (isEndOfFile(ch)) {
199                 token.type = EOF;
200                 token.isReady = true; // There is data at EOF
201                 break;
202             } else if (isDelimiter(ch)) {
203                 token.type = TOKEN;
204                 break;
205             } else if (isEscape(ch)) {
206                 final int unescaped = readEscape();
207                 if (unescaped == END_OF_STREAM) { // unexpected char after escape
208                     token.content.append((char) ch).append((char) reader.getLastChar());
209                 } else {
210                     token.content.append((char) unescaped);
211                 }
212                 ch = reader.read(); // continue
213             } else {
214                 token.content.append((char) ch);
215                 ch = reader.read(); // continue
216             }
217         }
218 
219         if (ignoreSurroundingSpaces) {
220             trimTrailingSpaces(token.content);
221         }
222 
223         return token;
224     }
225 
226     /**
227      * Parses an encapsulated token.
228      * <p/>
229      * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included
230      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
231      * an encapsulated token are ignored. The token is finished when one of the following conditions become true:
232      * <ul>
233      * <li>an unescaped encapsulator has been reached, and is followed by optional whitespace then:</li>
234      * <ul>
235      * <li>delimiter (TOKEN)</li>
236      * <li>end of line (EORECORD)</li>
237      * </ul>
238      * <li>end of stream has been reached (EOF)</li> </ul>
239      *
240      * @param token
241      *            the current token
242      * @return a valid token object
243      * @throws IOException
244      *             on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL
245      */
246     private Token parseEncapsulatedToken(final Token token) throws IOException {
247         // save current line number in case needed for IOE
248         final long startLineNumber = getCurrentLineNumber();
249         int c;
250         while (true) {
251             c = reader.read();
252 
253             if (isEscape(c)) {
254                 final int unescaped = readEscape();
255                 if (unescaped == END_OF_STREAM) { // unexpected char after escape
256                     token.content.append((char) c).append((char) reader.getLastChar());
257                 } else {
258                     token.content.append((char) unescaped);
259                 }
260             } else if (isQuoteChar(c)) {
261                 if (isQuoteChar(reader.lookAhead())) {
262                     // double or escaped encapsulator -> add single encapsulator to token
263                     c = reader.read();
264                     token.content.append((char) c);
265                 } else {
266                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
267                     while (true) {
268                         c = reader.read();
269                         if (isDelimiter(c)) {
270                             token.type = TOKEN;
271                             return token;
272                         } else if (isEndOfFile(c)) {
273                             token.type = EOF;
274                             token.isReady = true; // There is data at EOF
275                             return token;
276                         } else if (readEndOfLine(c)) {
277                             token.type = EORECORD;
278                             return token;
279                         } else if (!isWhitespace(c)) {
280                             // error invalid char between token and next delimiter
281                             throw new IOException("(line " + getCurrentLineNumber() +
282                                     ") invalid char between encapsulated token and delimiter");
283                         }
284                     }
285                 }
286             } else if (isEndOfFile(c)) {
287                 // error condition (end of file before end of token)
288                 throw new IOException("(startline " + startLineNumber +
289                         ") EOF reached before encapsulated token finished");
290             } else {
291                 // consume character
292                 token.content.append((char) c);
293             }
294         }
295     }
296 
297     private char mapNullToDisabled(final Character c) {
298         return c == null ? DISABLED : c.charValue();
299     }
300 
301     /**
302      * Returns the current line number
303      *
304      * @return the current line number
305      */
306     long getCurrentLineNumber() {
307         return reader.getCurrentLineNumber();
308     }
309 
310     /**
311      * Returns the current character position
312      *
313      * @return the current character position
314      */
315     long getCharacterPosition() {
316         return reader.getPosition();
317     }
318 
319     // TODO escape handling needs more work
320     /**
321      * Handle an escape sequence.
322      * The current character must be the escape character.
323      * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
324      * on the input stream.
325      *
326      * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is
327      *      invalid.
328      * @throws IOException if there is a problem reading the stream or the end of stream is detected:
329      *      the escape character is not allowed at end of strem
330      */
331     int readEscape() throws IOException {
332         // the escape char has just been read (normally a backslash)
333         final int ch = reader.read();
334         switch (ch) {
335         case 'r':
336             return CR;
337         case 'n':
338             return LF;
339         case 't':
340             return TAB;
341         case 'b':
342             return BACKSPACE;
343         case 'f':
344             return FF;
345         case CR:
346         case LF:
347         case FF: // TODO is this correct?
348         case TAB: // TODO is this correct? Do tabs need to be escaped?
349         case BACKSPACE: // TODO is this correct?
350             return ch;
351         case END_OF_STREAM:
352             throw new IOException("EOF whilst processing escape sequence");
353         default:
354             // Now check for meta-characters
355             if (isMetaChar(ch)) {
356                 return ch;
357             }
358             // indicate unexpected char - available from in.getLastChar()
359             return END_OF_STREAM;
360         }
361     }
362 
363     void trimTrailingSpaces(final StringBuilder buffer) {
364         int length = buffer.length();
365         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
366             length = length - 1;
367         }
368         if (length != buffer.length()) {
369             buffer.setLength(length);
370         }
371     }
372 
373     /**
374      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
375      *
376      * @return true if the given or next character is a line-terminator
377      */
378     boolean readEndOfLine(int ch) throws IOException {
379         // check if we have \r\n...
380         if (ch == CR && reader.lookAhead() == LF) {
381             // note: does not change ch outside of this method!
382             ch = reader.read();
383             // Save the EOL state
384             if (firstEol == null) {
385                 this.firstEol = Constants.CRLF;
386             }
387         }
388         // save EOL state here.
389         if (firstEol == null) {
390             if (ch == LF) {
391                 this.firstEol = LF_STRING;
392             } else if (ch == CR) {
393                 this.firstEol = CR_STRING;
394             }
395         }
396 
397         return ch == LF || ch == CR;
398     }
399 
400     boolean isClosed() {
401         return reader.isClosed();
402     }
403 
404     /**
405      * @return true if the given char is a whitespace character
406      */
407     boolean isWhitespace(final int ch) {
408         return !isDelimiter(ch) && Character.isWhitespace((char) ch);
409     }
410 
411     /**
412      * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
413      *
414      * @param ch the character to check
415      * @return true if the character is at the start of a line.
416      */
417     boolean isStartOfLine(final int ch) {
418         return ch == LF || ch == CR || ch == UNDEFINED;
419     }
420 
421     /**
422      * @return true if the given character indicates end of file
423      */
424     boolean isEndOfFile(final int ch) {
425         return ch == END_OF_STREAM;
426     }
427 
428     boolean isDelimiter(final int ch) {
429         return ch == delimiter;
430     }
431 
432     boolean isEscape(final int ch) {
433         return ch == escape;
434     }
435 
436     boolean isQuoteChar(final int ch) {
437         return ch == quoteChar;
438     }
439 
440     boolean isCommentStart(final int ch) {
441         return ch == commentStart;
442     }
443 
444     private boolean isMetaChar(final int ch) {
445         return ch == delimiter ||
446                ch == escape ||
447                ch == quoteChar ||
448                ch == commentStart;
449     }
450 
451     /**
452      * Closes resources.
453      *
454      * @throws IOException
455      *             If an I/O error occurs
456      */
457     @Override
458     public void close() throws IOException {
459         reader.close();
460     }
461 }