1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.csv;
19
20 import static org.apache.commons.csv.Constants.BACKSPACE;
21 import static org.apache.commons.csv.Constants.CR;
22 import static org.apache.commons.csv.Constants.FF;
23 import static org.apache.commons.csv.Constants.LF;
24 import static org.apache.commons.csv.Constants.TAB;
25 import static org.apache.commons.csv.Token.Type.COMMENT;
26 import static org.apache.commons.csv.Token.Type.EOF;
27 import static org.apache.commons.csv.Token.Type.EORECORD;
28 import static org.apache.commons.csv.Token.Type.TOKEN;
29 import static org.apache.commons.csv.TokenMatchers.hasContent;
30 import static org.apache.commons.csv.TokenMatchers.matches;
31 import static org.hamcrest.MatcherAssert.assertThat;
32 import static org.junit.jupiter.api.Assertions.assertEquals;
33 import static org.junit.jupiter.api.Assertions.assertFalse;
34 import static org.junit.jupiter.api.Assertions.assertThrows;
35 import static org.junit.jupiter.api.Assertions.assertTrue;
36
37 import java.io.IOException;
38 import java.io.StringReader;
39
40 import org.junit.jupiter.api.BeforeEach;
41 import org.junit.jupiter.api.Test;
42
43
44
45
46 public class LexerTest {
47
48 private CSVFormat formatWithEscaping;
49
50 @SuppressWarnings("resource")
51 private Lexer createLexer(final String input, final CSVFormat format) {
52 return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
53 }
54
55 @BeforeEach
56 public void setUp() {
57 formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
58 }
59
60
61 @Test
62 public void testBackslashWithEscaping() throws IOException {
63
64
65
66 final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
67 final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
68 assertTrue(format.isEscapeCharacterSet());
69 try (final Lexer parser = createLexer(code, format)) {
70 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
71 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
72 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
73 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
74 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
75 assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
76 assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
77 }
78 }
79
80
81 @Test
82 public void testBackslashWithoutEscaping() throws IOException {
83
84
85
86 final String code = "a,\\,,b\\\n\\,,";
87 final CSVFormat format = CSVFormat.DEFAULT;
88 assertFalse(format.isEscapeCharacterSet());
89 try (final Lexer parser = createLexer(code, format)) {
90 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
91
92 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
93 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
94 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
95
96 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
97 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
98 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
99 }
100 }
101
102 @Test
103 public void testBackspace() throws Exception {
104 try (final Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
105 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
106 }
107 }
108
109 @Test
110 public void testComments() throws IOException {
111 final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
112 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
113 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
114 try (final Lexer parser = createLexer(code, format)) {
115 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
116 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
117 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
118 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
119 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
120 assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
121 assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
122 assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
123 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
124 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
125 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
126 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
127 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
128 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
129 }
130 }
131
132 @Test
133 public void testCommentsAndEmptyLines() throws IOException {
134 final String code = "1,2,3,\n" +
135 "\n" +
136 "\n" +
137 "a,b x,c#no-comment\n" +
138 "#foo\n" +
139 "\n" +
140 "\n" +
141 "d,e,#no-comment\n" +
142 "\n" +
143 "\n" +
144 "# penultimate comment\n" +
145 "\n" +
146 "\n" +
147 "# Final comment\n";
148 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
149 assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
150
151 try (final Lexer parser = createLexer(code, format)) {
152 assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
153 assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
154 assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
155 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
156 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
157 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
158 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
159 assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
160 assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment"));
161 assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo"));
162 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
163 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
164 assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
165 assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
166 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
167 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
168 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
169 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
170 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
171 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
172 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
173 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
174 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
175 }
176 }
177
178 @Test
179 public void testCR() throws Exception {
180 try (final Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
181 assertThat(lexer.nextToken(new Token()), hasContent("character"));
182 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
183 }
184 }
185
186
187 @Test
188 public void testDelimiterIsWhitespace() throws IOException {
189 final String code = "one\ttwo\t\tfour \t five\t six";
190 try (final Lexer parser = createLexer(code, CSVFormat.TDF)) {
191 assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
192 assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
193 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
194 assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
195 assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
196 assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
197 }
198 }
199
200 @Test
201 public void testEscapedBackspace() throws Exception {
202 try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
203 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
204 }
205 }
206
207 @Test
208 public void testEscapedCharacter() throws Exception {
209 try (final Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
210 assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
211 }
212 }
213
214 @Test
215 public void testEscapedControlCharacter() throws Exception {
216
217 try (final Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
218 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
219 }
220 }
221
222 @Test
223 public void testEscapedControlCharacter2() throws Exception {
224 try (final Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
225 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
226 }
227 }
228
229 @Test
230 public void testEscapedCR() throws Exception {
231 try (final Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
232 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
233 }
234 }
235
236 @Test
237 public void testEscapedFF() throws Exception {
238 try (final Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
239 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
240 }
241 }
242
243 @Test
244 public void testEscapedLF() throws Exception {
245 try (final Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
246 assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
247 }
248 }
249
250 @Test
251 public void testEscapedMySqlNullValue() throws Exception {
252
253 try (final Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
254 assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
255 }
256 }
257
258 @Test
259 public void testEscapedTab() throws Exception {
260 try (final Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
261 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
262 }
263
264 }
265
266 @Test
267 public void testEscapingAtEOF() throws Exception {
268 final String code = "escaping at EOF is evil\\";
269 try (final Lexer lexer = createLexer(code, formatWithEscaping)) {
270 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
271 }
272 }
273
274 @Test
275 public void testFF() throws Exception {
276 try (final Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
277 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
278 }
279 }
280
281 @Test
282 public void testIgnoreEmptyLines() throws IOException {
283 final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
284 "\n" + "last, line \n" + "\n" + "\n" + "\n";
285 final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
286 try (final Lexer parser = createLexer(code, format)) {
287 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
288 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
289 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
290 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
291 assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
292 assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
293 assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
294 assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
295 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
296 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
297 }
298 }
299
300 @Test
301 public void testIsMetaCharCommentStart() throws IOException {
302 try (final Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
303 final int ch = lexer.readEscape();
304 assertEquals('#', ch);
305 }
306 }
307
308 @Test
309 public void testLF() throws Exception {
310 try (final Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
311 assertThat(lexer.nextToken(new Token()), hasContent("character"));
312 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
313 }
314 }
315
316
317 @Test
318 public void testNextToken4() throws IOException {
319
320
321
322 final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
323 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
324 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
325 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
326 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
327 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
328 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
329 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
330 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
331 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
332 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
333 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
334 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
335
336 assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
337 }
338 }
339
340
341 @Test
342 public void testNextToken5() throws IOException {
343 final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
344 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
345 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
346 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
347 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
348 assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n baar ,,,"));
349 assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
350 }
351 }
352
353
354 @Test
355 public void testNextToken6() throws IOException {
356
357
358
359 final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
360 final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
361 try (final Lexer parser = createLexer(code, format)) {
362 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
363 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
364 }
365 }
366
367 @Test
368 public void testReadEscapeBackspace() throws IOException {
369 try (final Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
370 final int ch = lexer.readEscape();
371 assertEquals(BACKSPACE, ch);
372 }
373 }
374
375 @Test
376 public void testReadEscapeFF() throws IOException {
377 try (final Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
378 final int ch = lexer.readEscape();
379 assertEquals(FF, ch);
380 }
381 }
382
383 @Test
384 public void testReadEscapeTab() throws IOException {
385 try (final Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
386 final int ch = lexer.readEscape();
387 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
388 assertEquals(TAB, ch);
389 }
390 }
391
392 @Test
393 public void testSurroundingSpacesAreDeleted() throws IOException {
394 final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
395 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
396 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
397 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
398 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
399 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
400 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
401 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
402 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
403 }
404 }
405
406 @Test
407 public void testSurroundingTabsAreDeleted() throws IOException {
408 final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
409 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
410 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
411 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
412 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
413 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
414 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
415 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
416 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
417 }
418 }
419
420 @Test
421 public void testTab() throws Exception {
422 try (final Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
423 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
424 }
425 }
426
427 @Test
428 public void testTrimTrailingSpacesZeroLength() throws Exception {
429 final StringBuilder buffer = new StringBuilder("");
430 final Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT);
431 lexer.trimTrailingSpaces(buffer);
432 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
433 }
434 }