1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.csv.Constants.BACKSPACE;
23 import static org.apache.commons.csv.Constants.CR;
24 import static org.apache.commons.csv.Constants.FF;
25 import static org.apache.commons.csv.Constants.LF;
26 import static org.apache.commons.csv.Constants.TAB;
27 import static org.apache.commons.csv.Token.Type.COMMENT;
28 import static org.apache.commons.csv.Token.Type.EOF;
29 import static org.apache.commons.csv.Token.Type.EORECORD;
30 import static org.apache.commons.csv.Token.Type.TOKEN;
31 import static org.junit.jupiter.api.Assertions.assertEquals;
32 import static org.junit.jupiter.api.Assertions.assertFalse;
33 import static org.junit.jupiter.api.Assertions.assertThrows;
34 import static org.junit.jupiter.api.Assertions.assertTrue;
35
36 import java.io.IOException;
37 import java.io.StringReader;
38
39 import org.junit.jupiter.api.BeforeEach;
40 import org.junit.jupiter.api.Test;
41
42
43
44 public class LexerTest {
45
46 private static void assertContent(final String expectedContent, final Token actualToken) {
47 assertEquals(expectedContent, actualToken.content.toString());
48 }
49
50 private static void assertNextToken(final String expectedContent, final Lexer lexer) throws IOException {
51 assertContent(expectedContent, lexer.nextToken(new Token()));
52 }
53
54 private static void assertNextToken(final Token.Type expectedType, final String expectedContent, final Lexer lexer) throws IOException {
55 final Token actualToken = lexer.nextToken(new Token());
56 assertEquals(expectedType, actualToken.type);
57 assertContent(expectedContent, actualToken);
58 }
59
60 private CSVFormat formatWithEscaping;
61
62 @SuppressWarnings("resource")
63 private Lexer createLexer(final String input, final CSVFormat format) {
64 return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
65 }
66
67 @BeforeEach
68 public void setUp() {
69 formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
70 }
71
72
73 @Test
74 public void testBackslashWithEscaping() throws IOException {
75
76
77
78 final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
79 final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
80 assertTrue(format.isEscapeCharacterSet());
81 try (Lexer lexer = createLexer(code, format)) {
82 assertNextToken(TOKEN, "a", lexer);
83 assertNextToken(TOKEN, ",", lexer);
84 assertNextToken(EORECORD, "b\\", lexer);
85 assertNextToken(TOKEN, ",", lexer);
86 assertNextToken(TOKEN, "\nc", lexer);
87 assertNextToken(EORECORD, "d\r", lexer);
88 assertNextToken(EOF, "e", lexer);
89 }
90 }
91
92
93 @Test
94 public void testBackslashWithoutEscaping() throws IOException {
95
96
97
98 final String code = "a,\\,,b\\\n\\,,";
99 final CSVFormat format = CSVFormat.DEFAULT;
100 assertFalse(format.isEscapeCharacterSet());
101 try (Lexer lexer = createLexer(code, format)) {
102
103 assertNextToken(TOKEN, "a", lexer);
104
105 assertNextToken(TOKEN, "\\", lexer);
106 assertNextToken(TOKEN, "", lexer);
107 assertNextToken(EORECORD, "b\\", lexer);
108
109 assertNextToken(TOKEN, "\\", lexer);
110 assertNextToken(TOKEN, "", lexer);
111 assertNextToken(EOF, "", lexer);
112 }
113 }
114
115 @Test
116 public void testBackspace() throws Exception {
117 try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
118 assertNextToken("character" + BACKSPACE + "NotEscaped", lexer);
119 }
120 }
121
122 @Test
123 public void testComments() throws IOException {
124 final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
125 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
126 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
127 try (Lexer lexer = createLexer(code, format)) {
128 assertNextToken(TOKEN, "first", lexer);
129 assertNextToken(TOKEN, "line", lexer);
130 assertNextToken(EORECORD, "", lexer);
131 assertNextToken(TOKEN, "second", lexer);
132 assertNextToken(TOKEN, "line", lexer);
133 assertNextToken(EORECORD, "tokenWith#no-comment", lexer);
134 assertNextToken(COMMENT, "comment line", lexer);
135 assertNextToken(TOKEN, "third", lexer);
136 assertNextToken(TOKEN, "line", lexer);
137 assertNextToken(EORECORD, "#no-comment", lexer);
138 assertNextToken(COMMENT, "penultimate comment", lexer);
139 assertNextToken(COMMENT, "Final comment", lexer);
140 assertNextToken(EOF, "", lexer);
141 assertNextToken(EOF, "", lexer);
142 }
143 }
144
145 @Test
146 public void testCommentsAndEmptyLines() throws IOException {
147 final String code = "1,2,3,\n" +
148 "\n" +
149 "\n" +
150 "a,b x,c#no-comment\n" +
151 "#foo\n" +
152 "\n" +
153 "\n" +
154 "d,e,#no-comment\n" +
155 "\n" +
156 "\n" +
157 "# penultimate comment\n" +
158 "\n" +
159 "\n" +
160 "# Final comment\n";
161 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
162 assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
163
164 try (Lexer lexer = createLexer(code, format)) {
165 assertNextToken(TOKEN, "1", lexer);
166 assertNextToken(TOKEN, "2", lexer);
167 assertNextToken(TOKEN, "3", lexer);
168 assertNextToken(EORECORD, "", lexer);
169 assertNextToken(EORECORD, "", lexer);
170 assertNextToken(EORECORD, "", lexer);
171 assertNextToken(TOKEN, "a", lexer);
172 assertNextToken(TOKEN, "b x", lexer);
173 assertNextToken(EORECORD, "c#no-comment", lexer);
174 assertNextToken(COMMENT, "foo", lexer);
175 assertNextToken(EORECORD, "", lexer);
176 assertNextToken(EORECORD, "", lexer);
177 assertNextToken(TOKEN, "d", lexer);
178 assertNextToken(TOKEN, "e", lexer);
179 assertNextToken(EORECORD, "#no-comment", lexer);
180 assertNextToken(EORECORD, "", lexer);
181 assertNextToken(EORECORD, "", lexer);
182 assertNextToken(COMMENT, "penultimate comment", lexer);
183 assertNextToken(EORECORD, "", lexer);
184 assertNextToken(EORECORD, "", lexer);
185 assertNextToken(COMMENT, "Final comment", lexer);
186 assertNextToken(EOF, "", lexer);
187 assertNextToken(EOF, "", lexer);
188 }
189 }
190
191 @Test
192 public void testCR() throws Exception {
193 try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
194 assertNextToken("character", lexer);
195 assertNextToken("NotEscaped", lexer);
196 }
197 }
198
199
200 @Test
201 public void testDelimiterIsWhitespace() throws IOException {
202 final String code = "one\ttwo\t\tfour \t five\t six";
203 try (Lexer lexer = createLexer(code, CSVFormat.TDF)) {
204 assertNextToken(TOKEN, "one", lexer);
205 assertNextToken(TOKEN, "two", lexer);
206 assertNextToken(TOKEN, "", lexer);
207 assertNextToken(TOKEN, "four", lexer);
208 assertNextToken(TOKEN, "five", lexer);
209 assertNextToken(EOF, "six", lexer);
210 }
211 }
212
213 @Test
214 public void testEOFWithoutClosingQuote() throws Exception {
215 final String code = "a,\"b";
216 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
217 assertNextToken(TOKEN, "a", lexer);
218 assertNextToken(EOF, "b", lexer);
219 }
220 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
221 assertNextToken(TOKEN, "a", lexer);
222 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
223 }
224 }
225
226 @Test
227 public void testEscapedBackspace() throws Exception {
228 try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
229 assertNextToken("character" + BACKSPACE + "Escaped", lexer);
230 }
231 }
232
233 @Test
234 public void testEscapedCharacter() throws Exception {
235 try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
236 assertNextToken("character\\aEscaped", lexer);
237 }
238 }
239
240 @Test
241 public void testEscapedControlCharacter() throws Exception {
242
243 try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
244 assertNextToken("character" + CR + "Escaped", lexer);
245 }
246 }
247
248 @Test
249 public void testEscapedControlCharacter2() throws Exception {
250 try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
251 assertNextToken("character" + CR + "Escaped", lexer);
252 }
253 }
254
255 @Test
256 public void testEscapedCR() throws Exception {
257 try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
258 assertNextToken("character" + CR + "Escaped", lexer);
259 }
260 }
261
262 @Test
263 public void testEscapedFF() throws Exception {
264 try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
265 assertNextToken("character" + FF + "Escaped", lexer);
266 }
267 }
268
269 @Test
270 public void testEscapedLF() throws Exception {
271 try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
272 assertNextToken("character" + LF + "Escaped", lexer);
273 }
274 }
275
276 @Test
277 public void testEscapedMySqlNullValue() throws Exception {
278
279 try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
280 assertNextToken("character\\NEscaped", lexer);
281 }
282 }
283
284 @Test
285 public void testEscapedTab() throws Exception {
286 try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
287 assertNextToken("character" + TAB + "Escaped", lexer);
288 }
289
290 }
291
292 @Test
293 public void testEscapingAtEOF() throws Exception {
294 final String code = "escaping at EOF is evil\\";
295 try (Lexer lexer = createLexer(code, formatWithEscaping)) {
296 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
297 }
298 }
299
300 @Test
301 public void testFF() throws Exception {
302 try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
303 assertNextToken("character" + FF + "NotEscaped", lexer);
304 }
305 }
306
307 @Test
308 public void testIgnoreEmptyLines() throws IOException {
309 final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
310 "\n" + "last, line \n" + "\n" + "\n" + "\n";
311 final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
312 try (Lexer lexer = createLexer(code, format)) {
313 assertNextToken(TOKEN, "first", lexer);
314 assertNextToken(TOKEN, "line", lexer);
315 assertNextToken(EORECORD, "", lexer);
316 assertNextToken(TOKEN, "second", lexer);
317 assertNextToken(EORECORD, "line", lexer);
318 assertNextToken(EORECORD, "third line ", lexer);
319 assertNextToken(TOKEN, "last", lexer);
320 assertNextToken(EORECORD, " line ", lexer);
321 assertNextToken(EOF, "", lexer);
322 assertNextToken(EOF, "", lexer);
323 }
324 }
325
326 @Test
327 public void testIsMetaCharCommentStart() throws IOException {
328 try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
329 final int ch = lexer.readEscape();
330 assertEquals('#', ch);
331 }
332 }
333
334 @Test
335 public void testLF() throws Exception {
336 try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
337 assertNextToken("character", lexer);
338 assertNextToken("NotEscaped", lexer);
339 }
340 }
341
342
343 @Test
344 public void testNextToken4() throws IOException {
345
346
347
348 final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
349 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
350 assertNextToken(TOKEN, "a", lexer);
351 assertNextToken(TOKEN, "foo", lexer);
352 assertNextToken(EORECORD, "b", lexer);
353 assertNextToken(TOKEN, "a", lexer);
354 assertNextToken(TOKEN, " foo", lexer);
355 assertNextToken(EORECORD, "b", lexer);
356 assertNextToken(TOKEN, "a", lexer);
357 assertNextToken(TOKEN, "foo ", lexer);
358 assertNextToken(EORECORD, "b", lexer);
359 assertNextToken(TOKEN, "a", lexer);
360 assertNextToken(TOKEN, " foo ", lexer);
361
362 assertNextToken(EOF, "b", lexer);
363 }
364 }
365
366
367 @Test
368 public void testNextToken5() throws IOException {
369 final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
370 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT)) {
371 assertNextToken(TOKEN, "a", lexer);
372 assertNextToken(TOKEN, "foo\n", lexer);
373 assertNextToken(EORECORD, "b", lexer);
374 assertNextToken(EORECORD, "foo\n baar ,,,", lexer);
375 assertNextToken(EOF, "\n\t \n", lexer);
376 }
377 }
378
379
380 @Test
381 public void testNextToken6() throws IOException {
382
383
384
385 final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
386 final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
387 try (Lexer lexer = createLexer(code, format)) {
388 assertNextToken(TOKEN, "a", lexer);
389 assertNextToken(EORECORD, "b and ' more\n", lexer);
390 }
391 }
392
393 @Test
394 public void testReadEscapeBackspace() throws IOException {
395 try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
396 final int ch = lexer.readEscape();
397 assertEquals(BACKSPACE, ch);
398 }
399 }
400
401 @Test
402 public void testReadEscapeFF() throws IOException {
403 try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
404 final int ch = lexer.readEscape();
405 assertEquals(FF, ch);
406 }
407 }
408
409 @Test
410 public void testReadEscapeTab() throws IOException {
411 try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
412 final int ch = lexer.readEscape();
413 assertNextToken(EOF, "", lexer);
414 assertEquals(TAB, ch);
415 }
416 }
417
418 @Test
419 public void testSurroundingSpacesAreDeleted() throws IOException {
420 final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
421 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
422 assertNextToken(TOKEN, "noSpaces", lexer);
423 assertNextToken(TOKEN, "leadingSpaces", lexer);
424 assertNextToken(TOKEN, "trailingSpaces", lexer);
425 assertNextToken(TOKEN, "surroundingSpaces", lexer);
426 assertNextToken(TOKEN, "", lexer);
427 assertNextToken(TOKEN, "", lexer);
428 assertNextToken(EOF, "", lexer);
429 }
430 }
431
432 @Test
433 public void testSurroundingTabsAreDeleted() throws IOException {
434 final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
435 try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
436 assertNextToken(TOKEN, "noTabs", lexer);
437 assertNextToken(TOKEN, "leadingTab", lexer);
438 assertNextToken(TOKEN, "trailingTab", lexer);
439 assertNextToken(TOKEN, "surroundingTabs", lexer);
440 assertNextToken(TOKEN, "", lexer);
441 assertNextToken(TOKEN, "", lexer);
442 assertNextToken(EOF, "", lexer);
443 }
444 }
445
446 @Test
447 public void testTab() throws Exception {
448 try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
449 assertNextToken("character" + TAB + "NotEscaped", lexer);
450 }
451 }
452
453 @Test
454 public void testTrailingTextAfterQuote() throws Exception {
455 final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
456 try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
457 assertNextToken(TOKEN, "a b", lexer);
458 assertNextToken(TOKEN, "a \" b", lexer);
459 assertNextToken(EOF, "a b \"\"", lexer);
460 }
461 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
462 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
463 }
464 }
465
466 @Test
467 public void testTrimTrailingSpacesZeroLength() throws Exception {
468 final StringBuilder buffer = new StringBuilder("");
469 try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
470 lexer.trimTrailingSpaces(buffer);
471 assertNextToken(EOF, "", lexer);
472 }
473 }
474 }