1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.csv;
19
20 import static org.apache.commons.csv.Constants.BACKSPACE;
21 import static org.apache.commons.csv.Constants.CR;
22 import static org.apache.commons.csv.Constants.END_OF_STREAM;
23 import static org.apache.commons.csv.Constants.FF;
24 import static org.apache.commons.csv.Constants.LF;
25 import static org.apache.commons.csv.Constants.TAB;
26 import static org.apache.commons.csv.Constants.UNDEFINED;
27 import static org.apache.commons.csv.Token.Type.COMMENT;
28 import static org.apache.commons.csv.Token.Type.EOF;
29 import static org.apache.commons.csv.Token.Type.EORECORD;
30 import static org.apache.commons.csv.Token.Type.INVALID;
31 import static org.apache.commons.csv.Token.Type.TOKEN;
32
33 import java.io.Closeable;
34 import java.io.IOException;
35
36
37
38
39 final class Lexer implements Closeable {
40
41 private static final String CR_STRING = Character.toString(CR);
42 private static final String LF_STRING = Character.toString(LF);
43
44
45
46
47
48
49 private static final char DISABLED = '\ufffe';
50
51 private final char[] delimiter;
52 private final char[] delimiterBuf;
53 private final char[] escapeDelimiterBuf;
54 private final char escape;
55 private final char quoteChar;
56 private final char commentStart;
57
58 private final boolean ignoreSurroundingSpaces;
59 private final boolean ignoreEmptyLines;
60
61
62 private final ExtendedBufferedReader reader;
63 private String firstEol;
64
65 private boolean isLastTokenDelimiter;
66
67 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
68 this.reader = reader;
69 this.delimiter = format.getDelimiterString().toCharArray();
70 this.escape = mapNullToDisabled(format.getEscapeCharacter());
71 this.quoteChar = mapNullToDisabled(format.getQuoteCharacter());
72 this.commentStart = mapNullToDisabled(format.getCommentMarker());
73 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
74 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
75 this.delimiterBuf = new char[delimiter.length - 1];
76 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
77 }
78
79
80
81
82
83
84
85 @Override
86 public void close() throws IOException {
87 reader.close();
88 }
89
90
91
92
93
94
95 long getCharacterPosition() {
96 return reader.getPosition();
97 }
98
99
100
101
102
103
104 long getCurrentLineNumber() {
105 return reader.getCurrentLineNumber();
106 }
107
108 String getFirstEol(){
109 return firstEol;
110 }
111
112 boolean isClosed() {
113 return reader.isClosed();
114 }
115
116 boolean isCommentStart(final int ch) {
117 return ch == commentStart;
118 }
119
120
121
122
123
124
125
126
127
128 boolean isDelimiter(final int ch) throws IOException {
129 isLastTokenDelimiter = false;
130 if (ch != delimiter[0]) {
131 return false;
132 }
133 if (delimiter.length == 1) {
134 isLastTokenDelimiter = true;
135 return true;
136 }
137 reader.lookAhead(delimiterBuf);
138 for (int i = 0; i < delimiterBuf.length; i++) {
139 if (delimiterBuf[i] != delimiter[i+1]) {
140 return false;
141 }
142 }
143 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
144 isLastTokenDelimiter = count != END_OF_STREAM;
145 return isLastTokenDelimiter;
146 }
147
148
149
150
151
152
153 boolean isEndOfFile(final int ch) {
154 return ch == END_OF_STREAM;
155 }
156
157
158
159
160
161
162 boolean isEscape(final int ch) {
163 return ch == escape;
164 }
165
166
167
168
169
170
171
172
173
174 boolean isEscapeDelimiter() throws IOException {
175 reader.lookAhead(escapeDelimiterBuf);
176 if (escapeDelimiterBuf[0] != delimiter[0]) {
177 return false;
178 }
179 for (int i = 1; i < delimiter.length; i++) {
180 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
181 return false;
182 }
183 }
184 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
185 return count != END_OF_STREAM;
186 }
187
188 private boolean isMetaChar(final int ch) {
189 return ch == escape || ch == quoteChar || ch == commentStart;
190 }
191
192 boolean isQuoteChar(final int ch) {
193 return ch == quoteChar;
194 }
195
196
197
198
199
200
201
202 boolean isStartOfLine(final int ch) {
203 return ch == LF || ch == CR || ch == UNDEFINED;
204 }
205
206 private char mapNullToDisabled(final Character c) {
207 return c == null ? DISABLED : c.charValue();
208 }
209
210
211
212
213
214
215
216
217
218
219
220
221 Token nextToken(final Token token) throws IOException {
222
223
224 int lastChar = reader.getLastChar();
225
226
227 int c = reader.read();
228
229
230
231
232 boolean eol = readEndOfLine(c);
233
234
235 if (ignoreEmptyLines) {
236 while (eol && isStartOfLine(lastChar)) {
237
238 lastChar = c;
239 c = reader.read();
240 eol = readEndOfLine(c);
241
242 if (isEndOfFile(c)) {
243 token.type = EOF;
244
245 return token;
246 }
247 }
248 }
249
250
251 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
252 token.type = EOF;
253
254 return token;
255 }
256
257 if (isStartOfLine(lastChar) && isCommentStart(c)) {
258 final String line = reader.readLine();
259 if (line == null) {
260 token.type = EOF;
261
262 return token;
263 }
264 final String comment = line.trim();
265 token.content.append(comment);
266 token.type = COMMENT;
267 return token;
268 }
269
270
271 while (token.type == INVALID) {
272
273 if (ignoreSurroundingSpaces) {
274 while (Character.isWhitespace((char)c) && !isDelimiter(c) && !eol) {
275 c = reader.read();
276 eol = readEndOfLine(c);
277 }
278 }
279
280
281 if (isDelimiter(c)) {
282
283 token.type = TOKEN;
284 } else if (eol) {
285
286
287 token.type = EORECORD;
288 } else if (isQuoteChar(c)) {
289
290 parseEncapsulatedToken(token);
291 } else if (isEndOfFile(c)) {
292
293
294 token.type = EOF;
295 token.isReady = true;
296 } else {
297
298
299 parseSimpleToken(token, c);
300 }
301 }
302 return token;
303 }
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326 private Token parseEncapsulatedToken(final Token token) throws IOException {
327 token.isQuoted = true;
328
329 final long startLineNumber = getCurrentLineNumber();
330 int c;
331 while (true) {
332 c = reader.read();
333
334 if (isEscape(c)) {
335 if (isEscapeDelimiter()) {
336 token.content.append(delimiter);
337 } else {
338 final int unescaped = readEscape();
339 if (unescaped == END_OF_STREAM) {
340 token.content.append((char) c).append((char) reader.getLastChar());
341 } else {
342 token.content.append((char) unescaped);
343 }
344 }
345 } else if (isQuoteChar(c)) {
346 if (isQuoteChar(reader.lookAhead())) {
347
348 c = reader.read();
349 token.content.append((char) c);
350 } else {
351
352 while (true) {
353 c = reader.read();
354 if (isDelimiter(c)) {
355 token.type = TOKEN;
356 return token;
357 }
358 if (isEndOfFile(c)) {
359 token.type = EOF;
360 token.isReady = true;
361 return token;
362 }
363 if (readEndOfLine(c)) {
364 token.type = EORECORD;
365 return token;
366 }
367 if (!Character.isWhitespace((char)c)) {
368
369 throw new IOException("(line " + getCurrentLineNumber() +
370 ") invalid char between encapsulated token and delimiter");
371 }
372 }
373 }
374 } else if (isEndOfFile(c)) {
375
376 throw new IOException("(startline " + startLineNumber +
377 ") EOF reached before encapsulated token finished");
378 } else {
379
380 token.content.append((char) c);
381 }
382 }
383 }
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405 private Token parseSimpleToken(final Token token, int ch) throws IOException {
406
407 while (true) {
408 if (readEndOfLine(ch)) {
409 token.type = EORECORD;
410 break;
411 }
412 if (isEndOfFile(ch)) {
413 token.type = EOF;
414 token.isReady = true;
415 break;
416 }
417 if (isDelimiter(ch)) {
418 token.type = TOKEN;
419 break;
420 }
421
422 if (isEscape(ch)) {
423 if (isEscapeDelimiter()) {
424 token.content.append(delimiter);
425 } else {
426 final int unescaped = readEscape();
427 if (unescaped == END_OF_STREAM) {
428 token.content.append((char) ch).append((char) reader.getLastChar());
429 } else {
430 token.content.append((char) unescaped);
431 }
432 }
433 } else {
434 token.content.append((char) ch);
435 }
436 ch = reader.read();
437 }
438
439 if (ignoreSurroundingSpaces) {
440 trimTrailingSpaces(token.content);
441 }
442
443 return token;
444 }
445
446
447
448
449
450
451 boolean readEndOfLine(int ch) throws IOException {
452
453 if (ch == CR && reader.lookAhead() == LF) {
454
455 ch = reader.read();
456
457 if (firstEol == null) {
458 this.firstEol = Constants.CRLF;
459 }
460 }
461
462 if (firstEol == null) {
463 if (ch == LF) {
464 this.firstEol = LF_STRING;
465 } else if (ch == CR) {
466 this.firstEol = CR_STRING;
467 }
468 }
469
470 return ch == LF || ch == CR;
471 }
472
473
474
475
476
477
478
479
480
481
482
483
484
485 int readEscape() throws IOException {
486
487 final int ch = reader.read();
488 switch (ch) {
489 case 'r':
490 return CR;
491 case 'n':
492 return LF;
493 case 't':
494 return TAB;
495 case 'b':
496 return BACKSPACE;
497 case 'f':
498 return FF;
499 case CR:
500 case LF:
501 case FF:
502 case TAB:
503 case BACKSPACE:
504 return ch;
505 case END_OF_STREAM:
506 throw new IOException("EOF whilst processing escape sequence");
507 default:
508
509 if (isMetaChar(ch)) {
510 return ch;
511 }
512
513 return END_OF_STREAM;
514 }
515 }
516
517 void trimTrailingSpaces(final StringBuilder buffer) {
518 int length = buffer.length();
519 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
520 length = length - 1;
521 }
522 if (length != buffer.length()) {
523 buffer.setLength(length);
524 }
525 }
526 }