1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.csv.Constants.BACKSPACE;
23 import static org.apache.commons.csv.Constants.CR;
24 import static org.apache.commons.csv.Constants.FF;
25 import static org.apache.commons.csv.Constants.LF;
26 import static org.apache.commons.csv.Constants.TAB;
27 import static org.apache.commons.csv.Token.Type.COMMENT;
28 import static org.apache.commons.csv.Token.Type.EOF;
29 import static org.apache.commons.csv.Token.Type.EORECORD;
30 import static org.apache.commons.csv.Token.Type.TOKEN;
31 import static org.apache.commons.csv.TokenMatchers.hasContent;
32 import static org.apache.commons.csv.TokenMatchers.matches;
33 import static org.hamcrest.MatcherAssert.assertThat;
34 import static org.junit.jupiter.api.Assertions.assertEquals;
35 import static org.junit.jupiter.api.Assertions.assertFalse;
36 import static org.junit.jupiter.api.Assertions.assertThrows;
37 import static org.junit.jupiter.api.Assertions.assertTrue;
38
39 import java.io.IOException;
40 import java.io.StringReader;
41
42 import org.junit.jupiter.api.BeforeEach;
43 import org.junit.jupiter.api.Test;
44
45
46
47 public class LexerTest {
48
49 private CSVFormat formatWithEscaping;
50
51 @SuppressWarnings("resource")
52 private Lexer createLexer(final String input, final CSVFormat format) {
53 return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
54 }
55
56 @BeforeEach
57 public void setUp() {
58 formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
59 }
60
61
62 @Test
63 public void testBackslashWithEscaping() throws IOException {
64
65
66
67 final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
68 final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
69 assertTrue(format.isEscapeCharacterSet());
70 try (Lexer parser = createLexer(code, format)) {
71 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
72 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
73 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
74 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
75 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
76 assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
77 assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
78 }
79 }
80
81
82 @Test
83 public void testBackslashWithoutEscaping() throws IOException {
84
85
86
87 final String code = "a,\\,,b\\\n\\,,";
88 final CSVFormat format = CSVFormat.DEFAULT;
89 assertFalse(format.isEscapeCharacterSet());
90 try (Lexer parser = createLexer(code, format)) {
91 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
92
93 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
94 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
95 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
96
97 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
98 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
99 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
100 }
101 }
102
103 @Test
104 public void testBackspace() throws Exception {
105 try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
106 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
107 }
108 }
109
110 @Test
111 public void testComments() throws IOException {
112 final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
113 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
114 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
115 try (Lexer parser = createLexer(code, format)) {
116 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
117 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
118 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
119 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
120 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
121 assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
122 assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
123 assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
124 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
125 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
126 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
127 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
128 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
129 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
130 }
131 }
132
133 @Test
134 public void testCommentsAndEmptyLines() throws IOException {
135 final String code = "1,2,3,\n" +
136 "\n" +
137 "\n" +
138 "a,b x,c#no-comment\n" +
139 "#foo\n" +
140 "\n" +
141 "\n" +
142 "d,e,#no-comment\n" +
143 "\n" +
144 "\n" +
145 "# penultimate comment\n" +
146 "\n" +
147 "\n" +
148 "# Final comment\n";
149 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
150 assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
151
152 try (Lexer parser = createLexer(code, format)) {
153 assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
154 assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
155 assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
156 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
157 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
158 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
159 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
160 assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
161 assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment"));
162 assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo"));
163 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
164 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
165 assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
166 assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
167 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
168 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
169 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
170 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
171 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
172 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
173 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
174 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
175 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
176 }
177 }
178
179 @Test
180 public void testCR() throws Exception {
181 try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
182 assertThat(lexer.nextToken(new Token()), hasContent("character"));
183 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
184 }
185 }
186
187
188 @Test
189 public void testDelimiterIsWhitespace() throws IOException {
190 final String code = "one\ttwo\t\tfour \t five\t six";
191 try (Lexer parser = createLexer(code, CSVFormat.TDF)) {
192 assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
193 assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
194 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
195 assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
196 assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
197 assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
198 }
199 }
200
201 @Test
202 public void testEOFWithoutClosingQuote() throws Exception {
203 final String code = "a,\"b";
204 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
205 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
206 assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
207 }
208 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
209 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
210 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
211 }
212 }
213
214 @Test
215 public void testEscapedBackspace() throws Exception {
216 try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
217 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
218 }
219 }
220
221 @Test
222 public void testEscapedCharacter() throws Exception {
223 try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
224 assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
225 }
226 }
227
228 @Test
229 public void testEscapedControlCharacter() throws Exception {
230
231 try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
232 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
233 }
234 }
235
236 @Test
237 public void testEscapedControlCharacter2() throws Exception {
238 try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
239 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
240 }
241 }
242
243 @Test
244 public void testEscapedCR() throws Exception {
245 try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
246 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
247 }
248 }
249
250 @Test
251 public void testEscapedFF() throws Exception {
252 try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
253 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
254 }
255 }
256
257 @Test
258 public void testEscapedLF() throws Exception {
259 try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
260 assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
261 }
262 }
263
264 @Test
265 public void testEscapedMySqlNullValue() throws Exception {
266
267 try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
268 assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
269 }
270 }
271
272 @Test
273 public void testEscapedTab() throws Exception {
274 try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
275 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
276 }
277
278 }
279
280 @Test
281 public void testEscapingAtEOF() throws Exception {
282 final String code = "escaping at EOF is evil\\";
283 try (Lexer lexer = createLexer(code, formatWithEscaping)) {
284 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
285 }
286 }
287
288 @Test
289 public void testFF() throws Exception {
290 try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
291 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
292 }
293 }
294
295 @Test
296 public void testIgnoreEmptyLines() throws IOException {
297 final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
298 "\n" + "last, line \n" + "\n" + "\n" + "\n";
299 final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
300 try (Lexer parser = createLexer(code, format)) {
301 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
302 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
303 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
304 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
305 assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
306 assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
307 assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
308 assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
309 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
310 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
311 }
312 }
313
314 @Test
315 public void testIsMetaCharCommentStart() throws IOException {
316 try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
317 final int ch = lexer.readEscape();
318 assertEquals('#', ch);
319 }
320 }
321
322 @Test
323 public void testLF() throws Exception {
324 try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
325 assertThat(lexer.nextToken(new Token()), hasContent("character"));
326 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
327 }
328 }
329
330
331 @Test
332 public void testNextToken4() throws IOException {
333
334
335
336 final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
337 try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
338 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
339 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
340 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
341 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
342 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
343 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
344 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
345 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
346 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
347 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
348 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
349
350 assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
351 }
352 }
353
354
355 @Test
356 public void testNextToken5() throws IOException {
357 final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
358 try (Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
359 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
360 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
361 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
362 assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n baar ,,,"));
363 assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
364 }
365 }
366
367
368 @Test
369 public void testNextToken6() throws IOException {
370
371
372
373 final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
374 final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
375 try (Lexer parser = createLexer(code, format)) {
376 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
377 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
378 }
379 }
380
381 @Test
382 public void testReadEscapeBackspace() throws IOException {
383 try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
384 final int ch = lexer.readEscape();
385 assertEquals(BACKSPACE, ch);
386 }
387 }
388
389 @Test
390 public void testReadEscapeFF() throws IOException {
391 try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
392 final int ch = lexer.readEscape();
393 assertEquals(FF, ch);
394 }
395 }
396
397 @Test
398 public void testReadEscapeTab() throws IOException {
399 try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
400 final int ch = lexer.readEscape();
401 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
402 assertEquals(TAB, ch);
403 }
404 }
405
406 @Test
407 public void testSurroundingSpacesAreDeleted() throws IOException {
408 final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
409 try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
410 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
411 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
412 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
413 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
414 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
415 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
416 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
417 }
418 }
419
420 @Test
421 public void testSurroundingTabsAreDeleted() throws IOException {
422 final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
423 try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
424 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
425 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
426 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
427 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
428 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
429 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
430 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
431 }
432 }
433
434 @Test
435 public void testTab() throws Exception {
436 try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
437 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
438 }
439 }
440
441 @Test
442 public void testTrailingTextAfterQuote() throws Exception {
443 final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
444 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
445 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
446 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
447 assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
448 }
449 try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
450 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
451 }
452 }
453
454 @Test
455 public void testTrimTrailingSpacesZeroLength() throws Exception {
456 final StringBuilder buffer = new StringBuilder("");
457 try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
458 lexer.trimTrailingSpaces(buffer);
459 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
460 }
461 }
462 }