1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.csv;
19
20 import static org.apache.commons.csv.Constants.BACKSPACE;
21 import static org.apache.commons.csv.Constants.CR;
22 import static org.apache.commons.csv.Constants.FF;
23 import static org.apache.commons.csv.Constants.LF;
24 import static org.apache.commons.csv.Constants.TAB;
25 import static org.apache.commons.csv.Constants.UNDEFINED;
26 import static org.apache.commons.csv.Token.Type.COMMENT;
27 import static org.apache.commons.csv.Token.Type.EORECORD;
28 import static org.apache.commons.csv.Token.Type.INVALID;
29 import static org.apache.commons.csv.Token.Type.TOKEN;
30 import static org.apache.commons.io.IOUtils.EOF;
31
32 import java.io.Closeable;
33 import java.io.IOException;
34
35
36
37
38 final class Lexer implements Closeable {
39
40 private static final String CR_STRING = Character.toString(CR);
41 private static final String LF_STRING = Character.toString(LF);
42
43
44
45
46
47
48 private static final char DISABLED = '\ufffe';
49
50 private final char[] delimiter;
51 private final char[] delimiterBuf;
52 private final char[] escapeDelimiterBuf;
53 private final char escape;
54 private final char quoteChar;
55 private final char commentStart;
56 private final boolean ignoreSurroundingSpaces;
57 private final boolean ignoreEmptyLines;
58 private final boolean lenientEof;
59 private final boolean trailingData;
60
61
62 private final ExtendedBufferedReader reader;
63 private String firstEol;
64
65 private boolean isLastTokenDelimiter;
66
67 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
68 this.reader = reader;
69 this.delimiter = format.getDelimiterCharArray();
70 this.escape = mapNullToDisabled(format.getEscapeCharacter());
71 this.quoteChar = mapNullToDisabled(format.getQuoteCharacter());
72 this.commentStart = mapNullToDisabled(format.getCommentMarker());
73 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
74 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
75 this.lenientEof = format.getLenientEof();
76 this.trailingData = format.getTrailingData();
77 this.delimiterBuf = new char[delimiter.length - 1];
78 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
79 }
80
81
82
83
84
85
86
87 @Override
88 public void close() throws IOException {
89 reader.close();
90 }
91
92
93
94
95
96
97 long getCharacterPosition() {
98 return reader.getPosition();
99 }
100
101
102
103
104
105
106 long getCurrentLineNumber() {
107 return reader.getCurrentLineNumber();
108 }
109
110 String getFirstEol(){
111 return firstEol;
112 }
113
114 boolean isClosed() {
115 return reader.isClosed();
116 }
117
118 boolean isCommentStart(final int ch) {
119 return ch == commentStart;
120 }
121
122
123
124
125
126
127
128
129
130 boolean isDelimiter(final int ch) throws IOException {
131 isLastTokenDelimiter = false;
132 if (ch != delimiter[0]) {
133 return false;
134 }
135 if (delimiter.length == 1) {
136 isLastTokenDelimiter = true;
137 return true;
138 }
139 reader.lookAhead(delimiterBuf);
140 for (int i = 0; i < delimiterBuf.length; i++) {
141 if (delimiterBuf[i] != delimiter[i+1]) {
142 return false;
143 }
144 }
145 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
146 isLastTokenDelimiter = count != EOF;
147 return isLastTokenDelimiter;
148 }
149
150
151
152
153
154
155 boolean isEndOfFile(final int ch) {
156 return ch == EOF;
157 }
158
159
160
161
162
163
164 boolean isEscape(final int ch) {
165 return ch == escape;
166 }
167
168
169
170
171
172
173
174
175
176 boolean isEscapeDelimiter() throws IOException {
177 reader.lookAhead(escapeDelimiterBuf);
178 if (escapeDelimiterBuf[0] != delimiter[0]) {
179 return false;
180 }
181 for (int i = 1; i < delimiter.length; i++) {
182 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
183 return false;
184 }
185 }
186 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
187 return count != EOF;
188 }
189
190 private boolean isMetaChar(final int ch) {
191 return ch == escape || ch == quoteChar || ch == commentStart;
192 }
193
194 boolean isQuoteChar(final int ch) {
195 return ch == quoteChar;
196 }
197
198
199
200
201
202
203
204 boolean isStartOfLine(final int ch) {
205 return ch == LF || ch == CR || ch == UNDEFINED;
206 }
207
208 private char mapNullToDisabled(final Character c) {
209 return c == null ? DISABLED : c.charValue();
210 }
211
212
213
214
215
216
217
218
219
220
221
222
223 Token nextToken(final Token token) throws IOException {
224
225
226 int lastChar = reader.getLastChar();
227
228
229 int c = reader.read();
230
231
232
233
234 boolean eol = readEndOfLine(c);
235
236
237 if (ignoreEmptyLines) {
238 while (eol && isStartOfLine(lastChar)) {
239
240 lastChar = c;
241 c = reader.read();
242 eol = readEndOfLine(c);
243
244 if (isEndOfFile(c)) {
245 token.type = Token.Type.EOF;
246
247 return token;
248 }
249 }
250 }
251
252
253 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
254 token.type = Token.Type.EOF;
255
256 return token;
257 }
258
259 if (isStartOfLine(lastChar) && isCommentStart(c)) {
260 final String line = reader.readLine();
261 if (line == null) {
262 token.type = Token.Type.EOF;
263
264 return token;
265 }
266 final String comment = line.trim();
267 token.content.append(comment);
268 token.type = COMMENT;
269 return token;
270 }
271
272
273 while (token.type == INVALID) {
274
275 if (ignoreSurroundingSpaces) {
276 while (Character.isWhitespace((char)c) && !isDelimiter(c) && !eol) {
277 c = reader.read();
278 eol = readEndOfLine(c);
279 }
280 }
281
282
283 if (isDelimiter(c)) {
284
285 token.type = TOKEN;
286 } else if (eol) {
287
288
289 token.type = EORECORD;
290 } else if (isQuoteChar(c)) {
291
292 parseEncapsulatedToken(token);
293 } else if (isEndOfFile(c)) {
294
295
296 token.type = Token.Type.EOF;
297 token.isReady = true;
298 } else {
299
300
301 parseSimpleToken(token, c);
302 }
303 }
304 return token;
305 }
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329 private Token parseEncapsulatedToken(final Token token) throws IOException {
330 token.isQuoted = true;
331
332 final long startLineNumber = getCurrentLineNumber();
333 int c;
334 while (true) {
335 c = reader.read();
336
337 if (isQuoteChar(c)) {
338 if (isQuoteChar(reader.lookAhead())) {
339
340 c = reader.read();
341 token.content.append((char) c);
342 } else {
343
344 while (true) {
345 c = reader.read();
346 if (isDelimiter(c)) {
347 token.type = TOKEN;
348 return token;
349 }
350 if (isEndOfFile(c)) {
351 token.type = Token.Type.EOF;
352 token.isReady = true;
353 return token;
354 }
355 if (readEndOfLine(c)) {
356 token.type = EORECORD;
357 return token;
358 }
359 if (trailingData) {
360 token.content.append((char) c);
361 } else if (!Character.isWhitespace((char) c)) {
362
363 throw new IOException(String.format("Invalid char between encapsulated token and delimiter at line: %,d, position: %,d",
364 getCurrentLineNumber(), getCharacterPosition()));
365 }
366 }
367 }
368 } else if (isEscape(c)) {
369 if (isEscapeDelimiter()) {
370 token.content.append(delimiter);
371 } else {
372 final int unescaped = readEscape();
373 if (unescaped == EOF) {
374 token.content.append((char) c).append((char) reader.getLastChar());
375 } else {
376 token.content.append((char) unescaped);
377 }
378 }
379 } else if (isEndOfFile(c)) {
380 if (lenientEof) {
381 token.type = Token.Type.EOF;
382 token.isReady = true;
383 return token;
384 }
385
386 throw new IOException("(startline " + startLineNumber +
387 ") EOF reached before encapsulated token finished");
388 } else {
389
390 token.content.append((char) c);
391 }
392 }
393 }
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415 private Token parseSimpleToken(final Token token, int ch) throws IOException {
416
417 while (true) {
418 if (readEndOfLine(ch)) {
419 token.type = EORECORD;
420 break;
421 }
422 if (isEndOfFile(ch)) {
423 token.type = Token.Type.EOF;
424 token.isReady = true;
425 break;
426 }
427 if (isDelimiter(ch)) {
428 token.type = TOKEN;
429 break;
430 }
431
432 if (isEscape(ch)) {
433 if (isEscapeDelimiter()) {
434 token.content.append(delimiter);
435 } else {
436 final int unescaped = readEscape();
437 if (unescaped == EOF) {
438 token.content.append((char) ch).append((char) reader.getLastChar());
439 } else {
440 token.content.append((char) unescaped);
441 }
442 }
443 } else {
444 token.content.append((char) ch);
445 }
446 ch = reader.read();
447 }
448
449 if (ignoreSurroundingSpaces) {
450 trimTrailingSpaces(token.content);
451 }
452
453 return token;
454 }
455
456
457
458
459
460
461 boolean readEndOfLine(int ch) throws IOException {
462
463 if (ch == CR && reader.lookAhead() == LF) {
464
465 ch = reader.read();
466
467 if (firstEol == null) {
468 this.firstEol = Constants.CRLF;
469 }
470 }
471
472 if (firstEol == null) {
473 if (ch == LF) {
474 this.firstEol = LF_STRING;
475 } else if (ch == CR) {
476 this.firstEol = CR_STRING;
477 }
478 }
479
480 return ch == LF || ch == CR;
481 }
482
483
484
485
486
487
488
489
490
491
492
493
494
495 int readEscape() throws IOException {
496
497 final int ch = reader.read();
498 switch (ch) {
499 case 'r':
500 return CR;
501 case 'n':
502 return LF;
503 case 't':
504 return TAB;
505 case 'b':
506 return BACKSPACE;
507 case 'f':
508 return FF;
509 case CR:
510 case LF:
511 case FF:
512 case TAB:
513 case BACKSPACE:
514 return ch;
515 case EOF:
516 throw new IOException("EOF whilst processing escape sequence");
517 default:
518
519 if (isMetaChar(ch)) {
520 return ch;
521 }
522
523 return EOF;
524 }
525 }
526
527 void trimTrailingSpaces(final StringBuilder buffer) {
528 int length = buffer.length();
529 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
530 length--;
531 }
532 if (length != buffer.length()) {
533 buffer.setLength(length);
534 }
535 }
536 }