1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.io.IOUtils.EOF;
23
24 import java.io.Closeable;
25 import java.io.IOException;
26
27 import org.apache.commons.io.IOUtils;
28
29 /**
30 * Lexical analyzer.
31 */
32 final class Lexer implements Closeable {
33
34 private static final String CR_STRING = Character.toString(Constants.CR);
35 private static final String LF_STRING = Character.toString(Constants.LF);
36
37 private final char[] delimiter;
38 private final char[] delimiterBuf;
39 private final char[] escapeDelimiterBuf;
40 private final int escape;
41 private final int quoteChar;
42 private final int commentStart;
43 private final boolean ignoreSurroundingSpaces;
44 private final boolean ignoreEmptyLines;
45 private final boolean lenientEof;
46 private final boolean trailingData;
47
48 /** The buffered reader. */
49 private final ExtendedBufferedReader reader;
50 private String firstEol;
51
52 private boolean isLastTokenDelimiter;
53
54 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
55 this.reader = reader;
56 this.delimiter = format.getDelimiterCharArray();
57 this.escape = nullToDisabled(format.getEscapeCharacter());
58 this.quoteChar = nullToDisabled(format.getQuoteCharacter());
59 this.commentStart = nullToDisabled(format.getCommentMarker());
60 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
61 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
62 this.lenientEof = format.getLenientEof();
63 this.trailingData = format.getTrailingData();
64 this.delimiterBuf = new char[delimiter.length - 1];
65 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
66 }
67
68 /**
69 * Appends the next escaped character to the token's content.
70 *
71 * @param token the current token
72 * @throws IOException on stream access error
73 * @throws CSVException Thrown on invalid input.
74 */
75 private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
76 if (isEscapeDelimiter()) {
77 token.content.append(delimiter);
78 } else {
79 final int unescaped = readEscape();
80 if (unescaped == EOF) { // unexpected char after escape
81 token.content.append((char) escape).append((char) reader.getLastChar());
82 } else {
83 token.content.append((char) unescaped);
84 }
85 }
86 }
87
88 /**
89 * Closes resources.
90 *
91 * @throws IOException
92 * If an I/O error occurs
93 */
94 @Override
95 public void close() throws IOException {
96 reader.close();
97 }
98
99 /**
100 * Gets the number of bytes read
101 *
102 * @return the number of bytes read
103 */
104 long getBytesRead() {
105 return reader.getBytesRead();
106 }
107
108 /**
109 * Returns the current character position
110 *
111 * @return the current character position
112 */
113 long getCharacterPosition() {
114 return reader.getPosition();
115 }
116
117 /**
118 * Returns the current line number
119 *
120 * @return the current line number
121 */
122 long getCurrentLineNumber() {
123 return reader.getLineNumber();
124 }
125
126 String getFirstEol() {
127 return firstEol;
128 }
129
130 boolean isClosed() {
131 return reader.isClosed();
132 }
133
134 boolean isCommentStart(final int ch) {
135 return ch == commentStart;
136 }
137
138 /**
139 * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
140 *
141 * @param ch
142 * the current character.
143 * @return true if the next characters constitute a delimiter.
144 * @throws IOException If an I/O error occurs.
145 */
146 boolean isDelimiter(final int ch) throws IOException {
147 isLastTokenDelimiter = false;
148 if (ch != delimiter[0]) {
149 return false;
150 }
151 if (delimiter.length == 1) {
152 isLastTokenDelimiter = true;
153 return true;
154 }
155 reader.peek(delimiterBuf);
156 for (int i = 0; i < delimiterBuf.length; i++) {
157 if (delimiterBuf[i] != delimiter[i + 1]) {
158 return false;
159 }
160 }
161 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
162 isLastTokenDelimiter = count != EOF;
163 return isLastTokenDelimiter;
164 }
165
166 /**
167 * Tests if the given character indicates the end of the file.
168 *
169 * @return true if the given character indicates the end of the file.
170 */
171 boolean isEndOfFile(final int ch) {
172 return ch == EOF;
173 }
174
175 /**
176 * Tests if the given character is the escape character.
177 *
178 * @return true if the given character is the escape character.
179 */
180 boolean isEscape(final int ch) {
181 return ch == escape;
182 }
183
184 /**
185 * Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
186 *
187 * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
188 *
189 * @return true if the next characters constitute an escape delimiter.
190 * @throws IOException If an I/O error occurs.
191 */
192 boolean isEscapeDelimiter() throws IOException {
193 reader.peek(escapeDelimiterBuf);
194 if (escapeDelimiterBuf[0] != delimiter[0]) {
195 return false;
196 }
197 for (int i = 1; i < delimiter.length; i++) {
198 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
199 return false;
200 }
201 }
202 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
203 return count != EOF;
204 }
205
206 private boolean isMetaChar(final int ch) {
207 return ch == escape || ch == quoteChar || ch == commentStart;
208 }
209
210 boolean isQuoteChar(final int ch) {
211 return ch == quoteChar;
212 }
213
214 /**
215 * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
216 *
217 * @param ch the character to check
218 * @return true if the character is at the start of a line.
219 */
220 boolean isStartOfLine(final int ch) {
221 return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
222 }
223
224 /**
225 * Returns the next token.
226 * <p>
227 * A token corresponds to a term, a record change or an end-of-file indicator.
228 * </p>
229 *
230 * @param token an existing Token object to reuse. The caller is responsible for initializing the Token.
231 * @return the next token found.
232 * @throws IOException on stream access error.
233 * @throws CSVException Thrown on invalid input.
234 */
235 Token nextToken(final Token token) throws IOException {
236 // Get the last read char (required for empty line detection)
237 int lastChar = reader.getLastChar();
238 // read the next char and set eol
239 int c = reader.read();
240 // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
241 boolean eol = readEndOfLine(c);
242 // empty line detection: eol AND (last char was EOL or beginning)
243 if (ignoreEmptyLines) {
244 while (eol && isStartOfLine(lastChar)) {
245 // Go on char ahead ...
246 lastChar = c;
247 c = reader.read();
248 eol = readEndOfLine(c);
249 // reached the end of the file without any content (empty line at the end)
250 if (isEndOfFile(c)) {
251 token.type = Token.Type.EOF;
252 // don't set token.isReady here because no content
253 return token;
254 }
255 }
256 }
257 // Did we reach EOF during the last iteration already? EOF
258 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
259 token.type = Token.Type.EOF;
260 // don't set token.isReady here because no content
261 return token;
262 }
263 if (isStartOfLine(lastChar) && isCommentStart(c)) {
264 final String line = reader.readLine();
265 if (line == null) {
266 token.type = Token.Type.EOF;
267 // don't set token.isReady here because no content
268 return token;
269 }
270 final String comment = line.trim();
271 token.content.append(comment);
272 token.type = Token.Type.COMMENT;
273 return token;
274 }
275 // Important: make sure a new char gets consumed in each iteration
276 while (token.type == Token.Type.INVALID) {
277 // ignore whitespaces at beginning of a token
278 if (ignoreSurroundingSpaces) {
279 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
280 c = reader.read();
281 eol = readEndOfLine(c);
282 }
283 }
284 // ok, start of token reached: encapsulated, or token
285 if (isDelimiter(c)) {
286 // empty token return TOKEN("")
287 token.type = Token.Type.TOKEN;
288 } else if (eol) {
289 // empty token return EORECORD("")
290 // noop: token.content.append("");
291 token.type = Token.Type.EORECORD;
292 } else if (isQuoteChar(c)) {
293 // consume encapsulated token
294 parseEncapsulatedToken(token);
295 } else if (isEndOfFile(c)) {
296 // end of file return EOF()
297 // noop: token.content.append("");
298 token.type = Token.Type.EOF;
299 token.isReady = true; // there is data at EOF
300 } else {
301 // next token must be a simple token
302 // add removed blanks when not ignoring whitespace chars...
303 parseSimpleToken(token, c);
304 }
305 }
306 return token;
307 }
308
309 private int nullToDisabled(final Character c) {
310 return c == null ? Constants.UNDEFINED : c.charValue(); // Explicit unboxing
311 }
312
313 /**
314 * Parses an encapsulated token.
315 * <p>
316 * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
317 * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after
318 * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true:
319 * </p>
320 * <ul>
321 * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:</li>
322 * <ul>
323 * <li>delimiter (TOKEN)</li>
324 * <li>end of line (EORECORD)</li>
325 * </ul>
326 * <li>end of stream has been reached (EOF)</li> </ul>
327 *
328 * @param token
329 * the current token
330 * @return a valid token object
331 * @throws IOException
332 * Thrown when in an invalid state: EOF before closing encapsulator or invalid character before
333 * delimiter or EOL.
334 * @throws CSVException Thrown on invalid input.
335 */
336 private Token parseEncapsulatedToken(final Token token) throws IOException {
337 token.isQuoted = true;
338 // Save current line number in case needed for IOE
339 final long startLineNumber = getCurrentLineNumber();
340 int c;
341 while (true) {
342 c = reader.read();
343 if (isQuoteChar(c)) {
344 if (isQuoteChar(reader.peek())) {
345 // double or escaped encapsulator -> add single encapsulator to token
346 c = reader.read();
347 token.content.append((char) c);
348 } else {
349 // token finish mark (encapsulator) reached: ignore whitespace till delimiter
350 while (true) {
351 c = reader.read();
352 if (isDelimiter(c)) {
353 token.type = Token.Type.TOKEN;
354 return token;
355 }
356 if (isEndOfFile(c)) {
357 token.type = Token.Type.EOF;
358 token.isReady = true; // There is data at EOF
359 return token;
360 }
361 if (readEndOfLine(c)) {
362 token.type = Token.Type.EORECORD;
363 return token;
364 }
365 if (trailingData) {
366 token.content.append((char) c);
367 } else if (!Character.isWhitespace((char) c)) {
368 // error invalid char between token and next delimiter
369 throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
370 getCurrentLineNumber(), getCharacterPosition());
371 }
372 }
373 }
374 } else if (isEscape(c)) {
375 appendNextEscapedCharacterToToken(token);
376 } else if (isEndOfFile(c)) {
377 if (lenientEof) {
378 token.type = Token.Type.EOF;
379 token.isReady = true; // There is data at EOF
380 return token;
381 }
382 // error condition (end of file before end of token)
383 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
384 } else {
385 // consume character
386 token.content.append((char) c);
387 }
388 }
389 }
390
391 /**
392 * Parses a simple token.
393 * <p>
394 * Simple tokens are tokens that are not surrounded by encapsulators. A simple token might contain escaped delimiters (as \, or \;). The token is finished
395 * when one of the following conditions becomes true:
396 * </p>
397 * <ul>
398 * <li>The end of line has been reached (EORECORD)</li>
399 * <li>The end of stream has been reached (EOF)</li>
400 * <li>An unescaped delimiter has been reached (TOKEN)</li>
401 * </ul>
402 *
403 * @param token the current token
404 * @param ch the current character
405 * @return the filled token
406 * @throws IOException on stream access error
407 * @throws CSVException Thrown on invalid input.
408 */
409 private Token parseSimpleToken(final Token token, final int ch) throws IOException {
410 // Faster to use while(true)+break than while(token.type == INVALID)
411 int cur = ch;
412 while (true) {
413 if (readEndOfLine(cur)) {
414 token.type = Token.Type.EORECORD;
415 break;
416 }
417 if (isEndOfFile(cur)) {
418 token.type = Token.Type.EOF;
419 token.isReady = true; // There is data at EOF
420 break;
421 }
422 if (isDelimiter(cur)) {
423 token.type = Token.Type.TOKEN;
424 break;
425 }
426 // continue
427 if (isEscape(cur)) {
428 appendNextEscapedCharacterToToken(token);
429 } else {
430 token.content.append((char) cur);
431 }
432 cur = reader.read(); // continue
433 }
434
435 if (ignoreSurroundingSpaces) {
436 trimTrailingSpaces(token.content);
437 }
438
439 return token;
440 }
441
442 /**
443 * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
444 *
445 * @return true if the given or next character is a line-terminator
446 */
447 boolean readEndOfLine(final int ch) throws IOException {
448 // check if we have \r\n...
449 int cur = ch;
450 if (cur == Constants.CR && reader.peek() == Constants.LF) {
451 // note: does not change ch outside of this method!
452 cur = reader.read();
453 // Save the EOL state
454 if (firstEol == null) {
455 this.firstEol = Constants.CRLF;
456 }
457 }
458 // save EOL state here.
459 if (firstEol == null) {
460 if (cur == Constants.LF) {
461 this.firstEol = LF_STRING;
462 } else if (cur == Constants.CR) {
463 this.firstEol = CR_STRING;
464 }
465 }
466
467 return cur == Constants.LF || cur == Constants.CR;
468 }
469
470 // TODO escape handling needs more work
471 /**
472 * Handle an escape sequence. The current character must be the escape character. On return, the next character is available by calling
473 * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
474 *
475 * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
476 * @throws IOException if there is a problem reading the stream or the end of stream is detected: the escape character is not allowed at end of stream
477 * @throws CSVException Thrown on invalid input.
478 */
479 int readEscape() throws IOException {
480 // the escape char has just been read (normally a backslash)
481 final int ch = reader.read();
482 switch (ch) {
483 case 'r':
484 return Constants.CR;
485 case 'n':
486 return Constants.LF;
487 case 't':
488 return Constants.TAB;
489 case 'b':
490 return Constants.BACKSPACE;
491 case 'f':
492 return Constants.FF;
493 case Constants.CR:
494 case Constants.LF:
495 case Constants.FF: // TODO is this correct?
496 case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
497 case Constants.BACKSPACE: // TODO is this correct?
498 return ch;
499 case EOF:
500 throw new CSVException("EOF while processing escape sequence");
501 default:
502 // Now check for meta-characters
503 if (isMetaChar(ch)) {
504 return ch;
505 }
506 // indicate unexpected char - available from in.getLastChar()
507 return EOF;
508 }
509 }
510
511 void trimTrailingSpaces(final StringBuilder buffer) {
512 int length = buffer.length();
513 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
514 length--;
515 }
516 if (length != buffer.length()) {
517 buffer.setLength(length);
518 }
519 }
520 }