1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.csv.Constants.BACKSPACE;
23 import static org.apache.commons.csv.Constants.CR;
24 import static org.apache.commons.csv.Constants.FF;
25 import static org.apache.commons.csv.Constants.LF;
26 import static org.apache.commons.csv.Constants.TAB;
27 import static org.apache.commons.csv.Token.Type.COMMENT;
28 import static org.apache.commons.csv.Token.Type.EOF;
29 import static org.apache.commons.csv.Token.Type.EORECORD;
30 import static org.apache.commons.csv.Token.Type.TOKEN;
31 import static org.junit.jupiter.api.Assertions.assertEquals;
32 import static org.junit.jupiter.api.Assertions.assertFalse;
33 import static org.junit.jupiter.api.Assertions.assertThrows;
34 import static org.junit.jupiter.api.Assertions.assertTrue;
35
36 import java.io.IOException;
37 import java.io.StringReader;
38
39 import org.junit.jupiter.api.BeforeEach;
40 import org.junit.jupiter.api.Test;
41
42
43
44 class LexerTest {
45
46 private static void assertContent(final String expectedContent, final Token actualToken) {
47 assertEquals(expectedContent, actualToken.content.toString());
48 }
49
50 private static void assertNextToken(final String expectedContent, final Lexer lexer) throws IOException {
51 assertContent(expectedContent, lexer.nextToken(new Token()));
52 }
53
54 private static void assertNextToken(final Token.Type expectedType, final String expectedContent, final Lexer lexer) throws IOException {
55 final Token actualToken = lexer.nextToken(new Token());
56 assertEquals(expectedType, actualToken.type);
57 assertContent(expectedContent, actualToken);
58 }
59
60 private CSVFormat formatWithEscaping;
61
62 @SuppressWarnings("resource")
63 private Lexer createLexer(final String input, final CSVFormat format) {
64 return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
65 }
66
67 @BeforeEach
68 public void setUp() {
69 formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
70 }
71
72
73 @Test
74 void testBackslashWithEscaping() throws IOException {
75
76
77
78 final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
79 final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
80 assertTrue(format.isEscapeCharacterSet());
81 try (Lexer lexer = createLexer(code, format)) {
82 assertNextToken(TOKEN, "a", lexer);
83 assertNextToken(TOKEN, ",", lexer);
84 assertNextToken(EORECORD, "b\\", lexer);
85 assertNextToken(TOKEN, ",", lexer);
86 assertNextToken(TOKEN, "\nc", lexer);
87 assertNextToken(EORECORD, "d\r", lexer);
88 assertNextToken(EOF, "e", lexer);
89 }
90 }
91
92
93 @Test
94 void testBackslashWithoutEscaping() throws IOException {
95
96
97
98 final String code = "a,\\,,b\\\n\\,,";
99 final CSVFormat format = CSVFormat.DEFAULT;
100 assertFalse(format.isEscapeCharacterSet());
101 try (Lexer lexer = createLexer(code, format)) {
102
103 assertNextToken(TOKEN, "a", lexer);
104
105 assertNextToken(TOKEN, "\\", lexer);
106 assertNextToken(TOKEN, "", lexer);
107 assertNextToken(EORECORD, "b\\", lexer);
108
109 assertNextToken(TOKEN, "\\", lexer);
110 assertNextToken(TOKEN, "", lexer);
111 assertNextToken(EOF, "", lexer);
112 }
113 }
114
115 @Test
116 void testBackspace() throws Exception {
117 try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
118 assertNextToken("character" + BACKSPACE + "NotEscaped", lexer);
119 }
120 }
121
122 @Test
123 void testComments() throws IOException {
124
125 final String code = "first,line,\n" +
126 "second,line,tokenWith#no-comment\n" +
127 "# comment line \n" +
128 "third,line,#no-comment\n" +
129 "# penultimate comment\n" +
130 "# Final comment\n";
131
132 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
133 try (Lexer lexer = createLexer(code, format)) {
134 assertNextToken(TOKEN, "first", lexer);
135 assertNextToken(TOKEN, "line", lexer);
136 assertNextToken(EORECORD, "", lexer);
137 assertNextToken(TOKEN, "second", lexer);
138 assertNextToken(TOKEN, "line", lexer);
139 assertNextToken(EORECORD, "tokenWith#no-comment", lexer);
140 assertNextToken(COMMENT, "comment line", lexer);
141 assertNextToken(TOKEN, "third", lexer);
142 assertNextToken(TOKEN, "line", lexer);
143 assertNextToken(EORECORD, "#no-comment", lexer);
144 assertNextToken(COMMENT, "penultimate comment", lexer);
145 assertNextToken(COMMENT, "Final comment", lexer);
146 assertNextToken(EOF, "", lexer);
147 assertNextToken(EOF, "", lexer);
148 }
149 }
150
151 @Test
152 void testCommentsAndEmptyLines() throws IOException {
153 final String code = "1,2,3,\n" +
154 "\n" +
155 "\n" +
156 "a,b x,c#no-comment\n" +
157 "#foo\n" +
158 "\n" +
159 "\n" +
160 "d,e,#no-comment\n" +
161 "\n" +
162 "\n" +
163 "# penultimate comment\n" +
164 "\n" +
165 "\n" +
166 "# Final comment\n";
167 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
168 assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
169
170 try (Lexer lexer = createLexer(code, format)) {
171 assertNextToken(TOKEN, "1", lexer);
172 assertNextToken(TOKEN, "2", lexer);
173 assertNextToken(TOKEN, "3", lexer);
174 assertNextToken(EORECORD, "", lexer);
175 assertNextToken(EORECORD, "", lexer);
176 assertNextToken(EORECORD, "", lexer);
177 assertNextToken(TOKEN, "a", lexer);
178 assertNextToken(TOKEN, "b x", lexer);
179 assertNextToken(EORECORD, "c#no-comment", lexer);
180 assertNextToken(COMMENT, "foo", lexer);
181 assertNextToken(EORECORD, "", lexer);
182 assertNextToken(EORECORD, "", lexer);
183 assertNextToken(TOKEN, "d", lexer);
184 assertNextToken(TOKEN, "e", lexer);
185 assertNextToken(EORECORD, "#no-comment", lexer);
186 assertNextToken(EORECORD, "", lexer);
187 assertNextToken(EORECORD, "", lexer);
188 assertNextToken(COMMENT, "penultimate comment", lexer);
189 assertNextToken(EORECORD, "", lexer);
190 assertNextToken(EORECORD, "", lexer);
191 assertNextToken(COMMENT, "Final comment", lexer);
192 assertNextToken(EOF, "", lexer);
193 assertNextToken(EOF, "", lexer);
194 }
195 }
196
197 @Test
198 void testCR() throws Exception {
199 try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
200 assertNextToken("character", lexer);
201 assertNextToken("NotEscaped", lexer);
202 }
203 }
204
205
206 @Test
207 void testDelimiterIsWhitespace() throws IOException {
208 final String code = "one\ttwo\t\tfour \t five\t six";
209 try (Lexer lexer = createLexer(code, CSVFormat.TDF)) {
210 assertNextToken(TOKEN, "one", lexer);
211 assertNextToken(TOKEN, "two", lexer);
212 assertNextToken(TOKEN, "", lexer);
213 assertNextToken(TOKEN, "four", lexer);
214 assertNextToken(TOKEN, "five", lexer);
215 assertNextToken(EOF, "six", lexer);
216 }
217 }
218
219 @Test
220 void testEOFWithoutClosingQuote() throws Exception {
221 final String code = "a,\"b";
222 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
223 assertNextToken(TOKEN, "a", lexer);
224 assertNextToken(EOF, "b", lexer);
225 }
226 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
227 assertNextToken(TOKEN, "a", lexer);
228 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
229 }
230 }
231
232 @Test
233 void testEscapedBackspace() throws Exception {
234 try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
235 assertNextToken("character" + BACKSPACE + "Escaped", lexer);
236 }
237 }
238
239 @Test
240 void testEscapedCharacter() throws Exception {
241 try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
242 assertNextToken("character\\aEscaped", lexer);
243 }
244 }
245
246 @Test
247 void testEscapedControlCharacter() throws Exception {
248
249 try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
250 assertNextToken("character" + CR + "Escaped", lexer);
251 }
252 }
253
254 @Test
255 void testEscapedControlCharacter2() throws Exception {
256 try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
257 assertNextToken("character" + CR + "Escaped", lexer);
258 }
259 }
260
261 @Test
262 void testEscapedCR() throws Exception {
263 try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
264 assertNextToken("character" + CR + "Escaped", lexer);
265 }
266 }
267
268 @Test
269 void testEscapedFF() throws Exception {
270 try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
271 assertNextToken("character" + FF + "Escaped", lexer);
272 }
273 }
274
275 @Test
276 void testEscapedLF() throws Exception {
277 try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
278 assertNextToken("character" + LF + "Escaped", lexer);
279 }
280 }
281
282 @Test
283 void testEscapedMySqlNullValue() throws Exception {
284
285 try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
286 assertNextToken("character\\NEscaped", lexer);
287 }
288 }
289
290 @Test
291 void testEscapedTab() throws Exception {
292 try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
293 assertNextToken("character" + TAB + "Escaped", lexer);
294 }
295
296 }
297
298 @Test
299 void testEscapingAtEOF() throws Exception {
300 final String code = "escaping at EOF is evil\\";
301 try (Lexer lexer = createLexer(code, formatWithEscaping)) {
302 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
303 }
304 }
305
306 @Test
307 void testFF() throws Exception {
308 try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
309 assertNextToken("character" + FF + "NotEscaped", lexer);
310 }
311 }
312
313 @Test
314 void testIgnoreEmptyLines() throws IOException {
315
316 final String code = "first,line,\n" +
317 "\n" +
318 "\n" +
319 "second,line\n" +
320 "\n" +
321 "\n" +
322 "third line \n" +
323 "\n" +
324 "\n" +
325 "last, line \n" +
326 "\n" +
327 "\n" +
328 "\n";
329
330 final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
331 try (Lexer lexer = createLexer(code, format)) {
332 assertNextToken(TOKEN, "first", lexer);
333 assertNextToken(TOKEN, "line", lexer);
334 assertNextToken(EORECORD, "", lexer);
335 assertNextToken(TOKEN, "second", lexer);
336 assertNextToken(EORECORD, "line", lexer);
337 assertNextToken(EORECORD, "third line ", lexer);
338 assertNextToken(TOKEN, "last", lexer);
339 assertNextToken(EORECORD, " line ", lexer);
340 assertNextToken(EOF, "", lexer);
341 assertNextToken(EOF, "", lexer);
342 }
343 }
344
345 @Test
346 void testIsMetaCharCommentStart() throws IOException {
347 try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
348 final int ch = lexer.readEscape();
349 assertEquals('#', ch);
350 }
351 }
352
353 @Test
354 void testLF() throws Exception {
355 try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
356 assertNextToken("character", lexer);
357 assertNextToken("NotEscaped", lexer);
358 }
359 }
360
361
362 @Test
363 void testNextToken4() throws IOException {
364
365
366
367 final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
368 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
369 assertNextToken(TOKEN, "a", lexer);
370 assertNextToken(TOKEN, "foo", lexer);
371 assertNextToken(EORECORD, "b", lexer);
372 assertNextToken(TOKEN, "a", lexer);
373 assertNextToken(TOKEN, " foo", lexer);
374 assertNextToken(EORECORD, "b", lexer);
375 assertNextToken(TOKEN, "a", lexer);
376 assertNextToken(TOKEN, "foo ", lexer);
377 assertNextToken(EORECORD, "b", lexer);
378 assertNextToken(TOKEN, "a", lexer);
379 assertNextToken(TOKEN, " foo ", lexer);
380
381 assertNextToken(EOF, "b", lexer);
382 }
383 }
384
385
386 @Test
387 void testNextToken5() throws IOException {
388 final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
389 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT)) {
390 assertNextToken(TOKEN, "a", lexer);
391 assertNextToken(TOKEN, "foo\n", lexer);
392 assertNextToken(EORECORD, "b", lexer);
393 assertNextToken(EORECORD, "foo\n baar ,,,", lexer);
394 assertNextToken(EOF, "\n\t \n", lexer);
395 }
396 }
397
398
399 @Test
400 void testNextToken6() throws IOException {
401
402
403
404 final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
405 final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
406 try (Lexer lexer = createLexer(code, format)) {
407 assertNextToken(TOKEN, "a", lexer);
408 assertNextToken(EORECORD, "b and ' more\n", lexer);
409 }
410 }
411
412 @Test
413 void testReadEscapeBackspace() throws IOException {
414 try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
415 final int ch = lexer.readEscape();
416 assertEquals(BACKSPACE, ch);
417 }
418 }
419
420 @Test
421 void testReadEscapeFF() throws IOException {
422 try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
423 final int ch = lexer.readEscape();
424 assertEquals(FF, ch);
425 }
426 }
427
428 @Test
429 void testReadEscapeTab() throws IOException {
430 try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
431 final int ch = lexer.readEscape();
432 assertNextToken(EOF, "", lexer);
433 assertEquals(TAB, ch);
434 }
435 }
436
437 @Test
438 void testSurroundingSpacesAreDeleted() throws IOException {
439 final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
440 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
441 assertNextToken(TOKEN, "noSpaces", lexer);
442 assertNextToken(TOKEN, "leadingSpaces", lexer);
443 assertNextToken(TOKEN, "trailingSpaces", lexer);
444 assertNextToken(TOKEN, "surroundingSpaces", lexer);
445 assertNextToken(TOKEN, "", lexer);
446 assertNextToken(TOKEN, "", lexer);
447 assertNextToken(EOF, "", lexer);
448 }
449 }
450
451 @Test
452 void testSurroundingTabsAreDeleted() throws IOException {
453 final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
454 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
455 assertNextToken(TOKEN, "noTabs", lexer);
456 assertNextToken(TOKEN, "leadingTab", lexer);
457 assertNextToken(TOKEN, "trailingTab", lexer);
458 assertNextToken(TOKEN, "surroundingTabs", lexer);
459 assertNextToken(TOKEN, "", lexer);
460 assertNextToken(TOKEN, "", lexer);
461 assertNextToken(EOF, "", lexer);
462 }
463 }
464
465 @Test
466 void testTab() throws Exception {
467 try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
468 assertNextToken("character" + TAB + "NotEscaped", lexer);
469 }
470 }
471
472 @Test
473 void testTrailingTextAfterQuote() throws Exception {
474 final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
475 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
476 assertNextToken(TOKEN, "a b", lexer);
477 assertNextToken(TOKEN, "a \" b", lexer);
478 assertNextToken(EOF, "a b \"\"", lexer);
479 }
480 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
481 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
482 }
483 }
484
485 @Test
486 void testTrimTrailingSpacesZeroLength() throws Exception {
487 final StringBuilder buffer = new StringBuilder("");
488 try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
489 lexer.trimTrailingSpaces(buffer);
490 assertNextToken(EOF, "", lexer);
491 }
492 }
493 }