1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.io.IOUtils.EOF;
23
24 import java.io.Closeable;
25 import java.io.IOException;
26
27 import org.apache.commons.io.IOUtils;
28
29
30
31
32 final class Lexer implements Closeable {
33
34 private static final String CR_STRING = Character.toString(Constants.CR);
35 private static final String LF_STRING = Character.toString(Constants.LF);
36
37 private final char[] delimiter;
38 private final char[] delimiterBuf;
39 private final char[] escapeDelimiterBuf;
40 private final int escape;
41 private final int quoteChar;
42 private final int commentStart;
43 private final boolean ignoreSurroundingSpaces;
44 private final boolean ignoreEmptyLines;
45 private final boolean lenientEof;
46 private final boolean trailingData;
47
48
49 private final ExtendedBufferedReader reader;
50 private String firstEol;
51
52 private boolean isLastTokenDelimiter;
53
54 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
55 this.reader = reader;
56 this.delimiter = format.getDelimiterCharArray();
57 this.escape = nullToDisabled(format.getEscapeCharacter());
58 this.quoteChar = nullToDisabled(format.getQuoteCharacter());
59 this.commentStart = nullToDisabled(format.getCommentMarker());
60 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
61 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
62 this.lenientEof = format.getLenientEof();
63 this.trailingData = format.getTrailingData();
64 this.delimiterBuf = new char[delimiter.length - 1];
65 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
66 }
67
68
69
70
71
72
73
74
75 private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
76 if (isEscapeDelimiter()) {
77 token.content.append(delimiter);
78 } else {
79 final int unescaped = readEscape();
80 if (unescaped == EOF) {
81 token.content.append((char) escape).append((char) reader.getLastChar());
82 } else {
83 token.content.append((char) unescaped);
84 }
85 }
86 }
87
88
89
90
91
92
93
94 @Override
95 public void close() throws IOException {
96 reader.close();
97 }
98
99
100
101
102
103
104 long getBytesRead() {
105 return reader.getBytesRead();
106 }
107
108
109
110
111
112
113 long getCharacterPosition() {
114 return reader.getPosition();
115 }
116
117
118
119
120
121
122 long getCurrentLineNumber() {
123 return reader.getLineNumber();
124 }
125
126 String getFirstEol() {
127 return firstEol;
128 }
129
130 boolean isClosed() {
131 return reader.isClosed();
132 }
133
134 boolean isCommentStart(final int ch) {
135 return ch == commentStart;
136 }
137
138
139
140
141
142
143
144
145
146 boolean isDelimiter(final int ch) throws IOException {
147 isLastTokenDelimiter = false;
148 if (ch != delimiter[0]) {
149 return false;
150 }
151 if (delimiter.length == 1) {
152 isLastTokenDelimiter = true;
153 return true;
154 }
155 reader.peek(delimiterBuf);
156 for (int i = 0; i < delimiterBuf.length; i++) {
157 if (delimiterBuf[i] != delimiter[i + 1]) {
158 return false;
159 }
160 }
161 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
162 isLastTokenDelimiter = count != EOF;
163 return isLastTokenDelimiter;
164 }
165
166
167
168
169
170
171 boolean isEndOfFile(final int ch) {
172 return ch == EOF;
173 }
174
175
176
177
178
179
180 boolean isEscape(final int ch) {
181 return ch == escape;
182 }
183
184
185
186
187
188
189
190
191
192 boolean isEscapeDelimiter() throws IOException {
193 reader.peek(escapeDelimiterBuf);
194 if (escapeDelimiterBuf[0] != delimiter[0]) {
195 return false;
196 }
197 for (int i = 1; i < delimiter.length; i++) {
198 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
199 return false;
200 }
201 }
202 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
203 return count != EOF;
204 }
205
206 private boolean isMetaChar(final int ch) {
207 return ch == escape || ch == quoteChar || ch == commentStart;
208 }
209
210 boolean isQuoteChar(final int ch) {
211 return ch == quoteChar;
212 }
213
214
215
216
217
218
219
220 boolean isStartOfLine(final int ch) {
221 return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
222 }
223
224
225
226
227
228
229
230
231
232
233
234
235 Token nextToken(final Token token) throws IOException {
236
237 int lastChar = reader.getLastChar();
238
239 int c = reader.read();
240
241 boolean eol = readEndOfLine(c);
242
243 if (ignoreEmptyLines) {
244 while (eol && isStartOfLine(lastChar)) {
245
246 lastChar = c;
247 c = reader.read();
248 eol = readEndOfLine(c);
249
250 if (isEndOfFile(c)) {
251 token.type = Token.Type.EOF;
252
253 return token;
254 }
255 }
256 }
257
258 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
259 token.type = Token.Type.EOF;
260
261 return token;
262 }
263 if (isStartOfLine(lastChar) && isCommentStart(c)) {
264 final String line = reader.readLine();
265 if (line == null) {
266 token.type = Token.Type.EOF;
267
268 return token;
269 }
270 final String comment = line.trim();
271 token.content.append(comment);
272 token.type = Token.Type.COMMENT;
273 return token;
274 }
275
276 while (token.type == Token.Type.INVALID) {
277
278 if (ignoreSurroundingSpaces) {
279 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
280 c = reader.read();
281 eol = readEndOfLine(c);
282 }
283 }
284
285 if (isDelimiter(c)) {
286
287 token.type = Token.Type.TOKEN;
288 } else if (eol) {
289
290
291 token.type = Token.Type.EORECORD;
292 } else if (isQuoteChar(c)) {
293
294 parseEncapsulatedToken(token);
295 } else if (isEndOfFile(c)) {
296
297
298 token.type = Token.Type.EOF;
299 token.isReady = true;
300 } else {
301
302
303 parseSimpleToken(token, c);
304 }
305 }
306 return token;
307 }
308
309 private int nullToDisabled(final Character c) {
310 return c == null ? Constants.UNDEFINED : c.charValue();
311 }
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336 private Token parseEncapsulatedToken(final Token token) throws IOException {
337 token.isQuoted = true;
338
339 final long startLineNumber = getCurrentLineNumber();
340 int c;
341 while (true) {
342 c = reader.read();
343 if (isQuoteChar(c)) {
344 if (isQuoteChar(reader.peek())) {
345
346 c = reader.read();
347 token.content.append((char) c);
348 } else {
349
350 while (true) {
351 c = reader.read();
352 if (isDelimiter(c)) {
353 token.type = Token.Type.TOKEN;
354 return token;
355 }
356 if (isEndOfFile(c)) {
357 token.type = Token.Type.EOF;
358 token.isReady = true;
359 return token;
360 }
361 if (readEndOfLine(c)) {
362 token.type = Token.Type.EORECORD;
363 return token;
364 }
365 if (trailingData) {
366 token.content.append((char) c);
367 } else if (!Character.isWhitespace((char) c)) {
368
369 throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
370 getCurrentLineNumber(), getCharacterPosition());
371 }
372 }
373 }
374 } else if (isEscape(c)) {
375 appendNextEscapedCharacterToToken(token);
376 } else if (isEndOfFile(c)) {
377 if (lenientEof) {
378 token.type = Token.Type.EOF;
379 token.isReady = true;
380 return token;
381 }
382
383 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
384 } else {
385
386 token.content.append((char) c);
387 }
388 }
389 }
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409 private Token parseSimpleToken(final Token token, final int ch) throws IOException {
410
411 int cur = ch;
412 while (true) {
413 if (readEndOfLine(cur)) {
414 token.type = Token.Type.EORECORD;
415 break;
416 }
417 if (isEndOfFile(cur)) {
418 token.type = Token.Type.EOF;
419 token.isReady = true;
420 break;
421 }
422 if (isDelimiter(cur)) {
423 token.type = Token.Type.TOKEN;
424 break;
425 }
426
427 if (isEscape(cur)) {
428 appendNextEscapedCharacterToToken(token);
429 } else {
430 token.content.append((char) cur);
431 }
432 cur = reader.read();
433 }
434
435 if (ignoreSurroundingSpaces) {
436 trimTrailingSpaces(token.content);
437 }
438
439 return token;
440 }
441
442
443
444
445
446
447 boolean readEndOfLine(final int ch) throws IOException {
448
449 int cur = ch;
450 if (cur == Constants.CR && reader.peek() == Constants.LF) {
451
452 cur = reader.read();
453
454 if (firstEol == null) {
455 this.firstEol = Constants.CRLF;
456 }
457 }
458
459 if (firstEol == null) {
460 if (cur == Constants.LF) {
461 this.firstEol = LF_STRING;
462 } else if (cur == Constants.CR) {
463 this.firstEol = CR_STRING;
464 }
465 }
466
467 return cur == Constants.LF || cur == Constants.CR;
468 }
469
470
471
472
473
474
475
476
477
478
479 int readEscape() throws IOException {
480
481 final int ch = reader.read();
482 switch (ch) {
483 case 'r':
484 return Constants.CR;
485 case 'n':
486 return Constants.LF;
487 case 't':
488 return Constants.TAB;
489 case 'b':
490 return Constants.BACKSPACE;
491 case 'f':
492 return Constants.FF;
493 case Constants.CR:
494 case Constants.LF:
495 case Constants.FF:
496 case Constants.TAB:
497 case Constants.BACKSPACE:
498 return ch;
499 case EOF:
500 throw new CSVException("EOF while processing escape sequence");
501 default:
502
503 if (isMetaChar(ch)) {
504 return ch;
505 }
506
507 return EOF;
508 }
509 }
510
511 void trimTrailingSpaces(final StringBuilder buffer) {
512 int length = buffer.length();
513 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
514 length--;
515 }
516 if (length != buffer.length()) {
517 buffer.setLength(length);
518 }
519 }
520 }