1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.io.IOUtils.EOF;
23
24 import java.io.Closeable;
25 import java.io.IOException;
26
27 import org.apache.commons.io.IOUtils;
28
29
30
31
32 final class Lexer implements Closeable {
33
34 private static final String CR_STRING = Character.toString(Constants.CR);
35 private static final String LF_STRING = Character.toString(Constants.LF);
36
37 private final char[] delimiter;
38 private final char[] delimiterBuf;
39 private final char[] escapeDelimiterBuf;
40 private final int escape;
41 private final int quoteChar;
42 private final int commentStart;
43 private final boolean ignoreSurroundingSpaces;
44 private final boolean ignoreEmptyLines;
45 private final boolean lenientEof;
46 private final boolean trailingData;
47
48
49 private final ExtendedBufferedReader reader;
50 private String firstEol;
51
52 private boolean isLastTokenDelimiter;
53
54 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
55 this.reader = reader;
56 this.delimiter = format.getDelimiterCharArray();
57 this.escape = nullToDisabled(format.getEscapeCharacter());
58 this.quoteChar = nullToDisabled(format.getQuoteCharacter());
59 this.commentStart = nullToDisabled(format.getCommentMarker());
60 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
61 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
62 this.lenientEof = format.getLenientEof();
63 this.trailingData = format.getTrailingData();
64 this.delimiterBuf = new char[delimiter.length - 1];
65 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
66 }
67
68
69
70
71
72
73
74
75 private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
76 if (isEscapeDelimiter()) {
77 token.content.append(delimiter);
78 } else {
79 final int unescaped = readEscape();
80 if (unescaped == EOF) {
81 token.content.append((char) escape).append((char) reader.getLastChar());
82 } else {
83 token.content.append((char) unescaped);
84 }
85 }
86 }
87
88
89
90
91
92
93
94 @Override
95 public void close() throws IOException {
96 reader.close();
97 }
98
99
100
101
102
103
104 long getBytesRead() {
105 return reader.getBytesRead();
106 }
107
108
109
110
111
112
113 long getCharacterPosition() {
114 return reader.getPosition();
115 }
116
117
118
119
120
121
122 long getCurrentLineNumber() {
123 return reader.getLineNumber();
124 }
125
126 String getFirstEol() {
127 return firstEol;
128 }
129
130 boolean isClosed() {
131 return reader.isClosed();
132 }
133
134 boolean isCommentStart(final int ch) {
135 return ch == commentStart;
136 }
137
138
139
140
141
142
143
144
145
146 boolean isDelimiter(final int ch) throws IOException {
147 isLastTokenDelimiter = false;
148 if (ch != delimiter[0]) {
149 return false;
150 }
151 if (delimiter.length == 1) {
152 isLastTokenDelimiter = true;
153 return true;
154 }
155 reader.peek(delimiterBuf);
156 for (int i = 0; i < delimiterBuf.length; i++) {
157 if (delimiterBuf[i] != delimiter[i + 1]) {
158 return false;
159 }
160 }
161 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
162 isLastTokenDelimiter = count != EOF;
163 return isLastTokenDelimiter;
164 }
165
166
167
168
169
170
171 boolean isEndOfFile(final int ch) {
172 return ch == EOF;
173 }
174
175
176
177
178
179
180 boolean isEscape(final int ch) {
181 return ch == escape;
182 }
183
184
185
186
187
188
189
190
191
192 boolean isEscapeDelimiter() throws IOException {
193 reader.peek(escapeDelimiterBuf);
194 if (escapeDelimiterBuf[0] != delimiter[0]) {
195 return false;
196 }
197 for (int i = 1; i < delimiter.length; i++) {
198 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
199 return false;
200 }
201 }
202 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
203 return count != EOF;
204 }
205
206 private boolean isMetaChar(final int ch) {
207 return ch == escape || ch == quoteChar || ch == commentStart;
208 }
209
210 boolean isQuoteChar(final int ch) {
211 return ch == quoteChar;
212 }
213
214
215
216
217
218
219
220 boolean isStartOfLine(final int ch) {
221 return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
222 }
223
224
225
226
227
228
229
230
231
232
233
234
235 Token nextToken(final Token token) throws IOException {
236
237 int lastChar = reader.getLastChar();
238
239 int c = reader.read();
240
241 boolean eol = readEndOfLine(c);
242
243 if (ignoreEmptyLines) {
244 while (eol && isStartOfLine(lastChar)) {
245
246 lastChar = c;
247 c = reader.read();
248 eol = readEndOfLine(c);
249
250 if (isEndOfFile(c)) {
251 token.type = Token.Type.EOF;
252
253 return token;
254 }
255 }
256 }
257
258 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
259 token.type = Token.Type.EOF;
260
261 return token;
262 }
263 if (isStartOfLine(lastChar) && isCommentStart(c)) {
264 final String line = reader.readLine();
265 if (line == null) {
266 token.type = Token.Type.EOF;
267
268 return token;
269 }
270 final String comment = line.trim();
271 token.content.append(comment);
272 token.type = Token.Type.COMMENT;
273 return token;
274 }
275
276 while (token.type == Token.Type.INVALID) {
277
278 if (ignoreSurroundingSpaces) {
279 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
280 c = reader.read();
281 eol = readEndOfLine(c);
282 }
283 }
284
285 if (isDelimiter(c)) {
286
287 token.type = Token.Type.TOKEN;
288 } else if (eol) {
289
290
291 token.type = Token.Type.EORECORD;
292 } else if (isQuoteChar(c)) {
293
294 parseEncapsulatedToken(token);
295 } else if (isEndOfFile(c)) {
296
297
298 token.type = Token.Type.EOF;
299 token.isReady = true;
300 } else {
301
302
303 parseSimpleToken(token, c);
304 }
305 }
306 return token;
307 }
308
309 private int nullToDisabled(final Character c) {
310 return c == null ? Constants.UNDEFINED : c.charValue();
311 }
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336 private Token parseEncapsulatedToken(final Token token) throws IOException {
337 token.isQuoted = true;
338
339 final long startLineNumber = getCurrentLineNumber();
340 int c;
341 while (true) {
342 c = reader.read();
343
344 if (isQuoteChar(c)) {
345 if (isQuoteChar(reader.peek())) {
346
347 c = reader.read();
348 token.content.append((char) c);
349 } else {
350
351 while (true) {
352 c = reader.read();
353 if (isDelimiter(c)) {
354 token.type = Token.Type.TOKEN;
355 return token;
356 }
357 if (isEndOfFile(c)) {
358 token.type = Token.Type.EOF;
359 token.isReady = true;
360 return token;
361 }
362 if (readEndOfLine(c)) {
363 token.type = Token.Type.EORECORD;
364 return token;
365 }
366 if (trailingData) {
367 token.content.append((char) c);
368 } else if (!Character.isWhitespace((char) c)) {
369
370 throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
371 getCurrentLineNumber(), getCharacterPosition());
372 }
373 }
374 }
375 } else if (isEscape(c)) {
376 appendNextEscapedCharacterToToken(token);
377 } else if (isEndOfFile(c)) {
378 if (lenientEof) {
379 token.type = Token.Type.EOF;
380 token.isReady = true;
381 return token;
382 }
383
384 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
385 } else {
386
387 token.content.append((char) c);
388 }
389 }
390 }
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410 private Token parseSimpleToken(final Token token, final int ch) throws IOException {
411
412 int cur = ch;
413 while (true) {
414 if (readEndOfLine(cur)) {
415 token.type = Token.Type.EORECORD;
416 break;
417 }
418 if (isEndOfFile(cur)) {
419 token.type = Token.Type.EOF;
420 token.isReady = true;
421 break;
422 }
423 if (isDelimiter(cur)) {
424 token.type = Token.Type.TOKEN;
425 break;
426 }
427
428 if (isEscape(cur)) {
429 appendNextEscapedCharacterToToken(token);
430 } else {
431 token.content.append((char) cur);
432 }
433 cur = reader.read();
434 }
435
436 if (ignoreSurroundingSpaces) {
437 trimTrailingSpaces(token.content);
438 }
439
440 return token;
441 }
442
443
444
445
446
447
448 boolean readEndOfLine(final int ch) throws IOException {
449
450 int cur = ch;
451 if (cur == Constants.CR && reader.peek() == Constants.LF) {
452
453 cur = reader.read();
454
455 if (firstEol == null) {
456 this.firstEol = Constants.CRLF;
457 }
458 }
459
460 if (firstEol == null) {
461 if (cur == Constants.LF) {
462 this.firstEol = LF_STRING;
463 } else if (cur == Constants.CR) {
464 this.firstEol = CR_STRING;
465 }
466 }
467
468 return cur == Constants.LF || cur == Constants.CR;
469 }
470
471
472
473
474
475
476
477
478
479
480 int readEscape() throws IOException {
481
482 final int ch = reader.read();
483 switch (ch) {
484 case 'r':
485 return Constants.CR;
486 case 'n':
487 return Constants.LF;
488 case 't':
489 return Constants.TAB;
490 case 'b':
491 return Constants.BACKSPACE;
492 case 'f':
493 return Constants.FF;
494 case Constants.CR:
495 case Constants.LF:
496 case Constants.FF:
497 case Constants.TAB:
498 case Constants.BACKSPACE:
499 return ch;
500 case EOF:
501 throw new CSVException("EOF while processing escape sequence");
502 default:
503
504 if (isMetaChar(ch)) {
505 return ch;
506 }
507
508 return EOF;
509 }
510 }
511
512 void trimTrailingSpaces(final StringBuilder buffer) {
513 int length = buffer.length();
514 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
515 length--;
516 }
517 if (length != buffer.length()) {
518 buffer.setLength(length);
519 }
520 }
521 }