/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Constants.BACKSPACE;
23  import static org.apache.commons.csv.Constants.CR;
24  import static org.apache.commons.csv.Constants.FF;
25  import static org.apache.commons.csv.Constants.LF;
26  import static org.apache.commons.csv.Constants.TAB;
27  import static org.apache.commons.csv.Token.Type.COMMENT;
28  import static org.apache.commons.csv.Token.Type.EOF;
29  import static org.apache.commons.csv.Token.Type.EORECORD;
30  import static org.apache.commons.csv.Token.Type.TOKEN;
31  import static org.junit.jupiter.api.Assertions.assertEquals;
32  import static org.junit.jupiter.api.Assertions.assertFalse;
33  import static org.junit.jupiter.api.Assertions.assertThrows;
34  import static org.junit.jupiter.api.Assertions.assertTrue;
35  
36  import java.io.IOException;
37  import java.io.StringReader;
38  
39  import org.junit.jupiter.api.BeforeEach;
40  import org.junit.jupiter.api.Test;
41  
42  /**
43   */
44  class LexerTest {
45  
46      private static void assertContent(final String expectedContent, final Token actualToken) {
47          assertEquals(expectedContent, actualToken.content.toString());
48      }
49  
50      private static void assertNextToken(final String expectedContent, final Lexer lexer) throws IOException {
51          assertContent(expectedContent, lexer.nextToken(new Token()));
52      }
53  
54      private static void assertNextToken(final Token.Type expectedType, final String expectedContent, final Lexer lexer) throws IOException {
55          final Token actualToken = lexer.nextToken(new Token());
56          assertEquals(expectedType, actualToken.type);
57          assertContent(expectedContent, actualToken);
58      }
59  
60      private CSVFormat formatWithEscaping;
61  
62      @SuppressWarnings("resource")
63      private Lexer createLexer(final String input, final CSVFormat format) {
64          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
65      }
66  
67      @BeforeEach
68      public void setUp() {
69          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
70      }
71  
72      // simple token with escaping enabled
73      @Test
74      void testBackslashWithEscaping() throws IOException {
75          /*
76           * file: a,\,,b \,,
77           */
78          final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
79          final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
80          assertTrue(format.isEscapeCharacterSet());
81          try (Lexer lexer = createLexer(code, format)) {
82              assertNextToken(TOKEN, "a", lexer);
83              assertNextToken(TOKEN, ",", lexer);
84              assertNextToken(EORECORD, "b\\", lexer);
85              assertNextToken(TOKEN, ",", lexer);
86              assertNextToken(TOKEN, "\nc", lexer);
87              assertNextToken(EORECORD, "d\r", lexer);
88              assertNextToken(EOF, "e", lexer);
89          }
90      }
91  
92      // simple token with escaping not enabled
93      @Test
94      void testBackslashWithoutEscaping() throws IOException {
95          /*
96           * file: a,\,,b \,,
97           */
98          final String code = "a,\\,,b\\\n\\,,";
99          final CSVFormat format = CSVFormat.DEFAULT;
100         assertFalse(format.isEscapeCharacterSet());
101         try (Lexer lexer = createLexer(code, format)) {
102             // parser.nextToken(new Token())
103             assertNextToken(TOKEN, "a", lexer);
104             // an unquoted single backslash is not an escape char
105             assertNextToken(TOKEN, "\\", lexer);
106             assertNextToken(TOKEN, "", lexer);
107             assertNextToken(EORECORD, "b\\", lexer);
108             // an unquoted single backslash is not an escape char
109             assertNextToken(TOKEN, "\\", lexer);
110             assertNextToken(TOKEN, "", lexer);
111             assertNextToken(EOF, "", lexer);
112         }
113     }
114 
115     @Test
116     void testBackspace() throws Exception {
117         try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
118             assertNextToken("character" + BACKSPACE + "NotEscaped", lexer);
119         }
120     }
121 
122     @Test
123     void testComments() throws IOException {
124         // @formatter:off
125         final String code = "first,line,\n" +
126                 "second,line,tokenWith#no-comment\n" +
127                 "# comment line \n" +
128                 "third,line,#no-comment\n" +
129                 "# penultimate comment\n" +
130                 "# Final comment\n";
131         // @formatter:on
132         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
133         try (Lexer lexer = createLexer(code, format)) {
134             assertNextToken(TOKEN, "first", lexer);
135             assertNextToken(TOKEN, "line", lexer);
136             assertNextToken(EORECORD, "", lexer);
137             assertNextToken(TOKEN, "second", lexer);
138             assertNextToken(TOKEN, "line", lexer);
139             assertNextToken(EORECORD, "tokenWith#no-comment", lexer);
140             assertNextToken(COMMENT, "comment line", lexer);
141             assertNextToken(TOKEN, "third", lexer);
142             assertNextToken(TOKEN, "line", lexer);
143             assertNextToken(EORECORD, "#no-comment", lexer);
144             assertNextToken(COMMENT, "penultimate comment", lexer);
145             assertNextToken(COMMENT, "Final comment", lexer);
146             assertNextToken(EOF, "", lexer);
147             assertNextToken(EOF, "", lexer);
148         }
149     }
150 
151     @Test
152     void testCommentsAndEmptyLines() throws IOException {
153         final String code = "1,2,3,\n" + // 1
154                 "\n" + // 1b
155                 "\n" + // 1c
156                 "a,b x,c#no-comment\n" + // 2
157                 "#foo\n" + // 3
158                 "\n" + // 4
159                 "\n" + // 4b
160                 "d,e,#no-comment\n" + // 5
161                 "\n" + // 5b
162                 "\n" + // 5c
163                 "# penultimate comment\n" + // 6
164                 "\n" + // 6b
165                 "\n" + // 6c
166                 "# Final comment\n"; // 7
167         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
168         assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
169 
170         try (Lexer lexer = createLexer(code, format)) {
171             assertNextToken(TOKEN, "1", lexer);
172             assertNextToken(TOKEN, "2", lexer);
173             assertNextToken(TOKEN, "3", lexer);
174             assertNextToken(EORECORD, "", lexer); // 1
175             assertNextToken(EORECORD, "", lexer); // 1b
176             assertNextToken(EORECORD, "", lexer); // 1c
177             assertNextToken(TOKEN, "a", lexer);
178             assertNextToken(TOKEN, "b x", lexer);
179             assertNextToken(EORECORD, "c#no-comment", lexer); // 2
180             assertNextToken(COMMENT, "foo", lexer); // 3
181             assertNextToken(EORECORD, "", lexer); // 4
182             assertNextToken(EORECORD, "", lexer); // 4b
183             assertNextToken(TOKEN, "d", lexer);
184             assertNextToken(TOKEN, "e", lexer);
185             assertNextToken(EORECORD, "#no-comment", lexer); // 5
186             assertNextToken(EORECORD, "", lexer); // 5b
187             assertNextToken(EORECORD, "", lexer); // 5c
188             assertNextToken(COMMENT, "penultimate comment", lexer); // 6
189             assertNextToken(EORECORD, "", lexer); // 6b
190             assertNextToken(EORECORD, "", lexer); // 6c
191             assertNextToken(COMMENT, "Final comment", lexer); // 7
192             assertNextToken(EOF, "", lexer);
193             assertNextToken(EOF, "", lexer);
194         }
195     }
196 
197     @Test
198     void testCR() throws Exception {
199         try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
200             assertNextToken("character", lexer);
201             assertNextToken("NotEscaped", lexer);
202         }
203     }
204 
205     // From CSV-1
206     @Test
207     void testDelimiterIsWhitespace() throws IOException {
208         final String code = "one\ttwo\t\tfour \t five\t six";
209         try (Lexer lexer = createLexer(code, CSVFormat.TDF)) {
210             assertNextToken(TOKEN, "one", lexer);
211             assertNextToken(TOKEN, "two", lexer);
212             assertNextToken(TOKEN, "", lexer);
213             assertNextToken(TOKEN, "four", lexer);
214             assertNextToken(TOKEN, "five", lexer);
215             assertNextToken(EOF, "six", lexer);
216         }
217     }
218 
219     @Test
220     void testEOFWithoutClosingQuote() throws Exception {
221         final String code = "a,\"b";
222         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
223             assertNextToken(TOKEN, "a", lexer);
224             assertNextToken(EOF, "b", lexer);
225         }
226         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
227             assertNextToken(TOKEN, "a", lexer);
228             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
229         }
230     }
231 
232     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
233     void testEscapedBackspace() throws Exception {
234         try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
235             assertNextToken("character" + BACKSPACE + "Escaped", lexer);
236         }
237     }
238 
239     @Test
240     void testEscapedCharacter() throws Exception {
241         try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
242             assertNextToken("character\\aEscaped", lexer);
243         }
244     }
245 
246     @Test
247     void testEscapedControlCharacter() throws Exception {
248         // we are explicitly using an escape different from \ here
249         try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
250             assertNextToken("character" + CR + "Escaped", lexer);
251         }
252     }
253 
254     @Test
255     void testEscapedControlCharacter2() throws Exception {
256         try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
257             assertNextToken("character" + CR + "Escaped", lexer);
258         }
259     }
260 
261     @Test
262     void testEscapedCR() throws Exception {
263         try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
264             assertNextToken("character" + CR + "Escaped", lexer);
265         }
266     }
267 
268     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
269     void testEscapedFF() throws Exception {
270         try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
271             assertNextToken("character" + FF + "Escaped", lexer);
272         }
273     }
274 
275     @Test
276     void testEscapedLF() throws Exception {
277         try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
278             assertNextToken("character" + LF + "Escaped", lexer);
279         }
280     }
281 
282     @Test
283     void testEscapedMySqlNullValue() throws Exception {
284         // MySQL uses \N to symbolize null values. We have to restore this
285         try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
286             assertNextToken("character\\NEscaped", lexer);
287         }
288     }
289 
290     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
291     void testEscapedTab() throws Exception {
292         try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
293             assertNextToken("character" + TAB + "Escaped", lexer);
294         }
295 
296     }
297 
298     @Test
299     void testEscapingAtEOF() throws Exception {
300         final String code = "escaping at EOF is evil\\";
301         try (Lexer lexer = createLexer(code, formatWithEscaping)) {
302             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
303         }
304     }
305 
306     @Test
307     void testFF() throws Exception {
308         try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
309             assertNextToken("character" + FF + "NotEscaped", lexer);
310         }
311     }
312 
313     @Test
314     void testIgnoreEmptyLines() throws IOException {
315         // @formatter:off
316         final String code = "first,line,\n" +
317                 "\n" +
318                 "\n" +
319                 "second,line\n" +
320                 "\n" +
321                 "\n" +
322                 "third line \n" +
323                 "\n" +
324                 "\n" +
325                 "last, line \n" +
326                 "\n" +
327                 "\n" +
328                 "\n";
329         // @formatter:on
330         final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
331         try (Lexer lexer = createLexer(code, format)) {
332             assertNextToken(TOKEN, "first", lexer);
333             assertNextToken(TOKEN, "line", lexer);
334             assertNextToken(EORECORD, "", lexer);
335             assertNextToken(TOKEN, "second", lexer);
336             assertNextToken(EORECORD, "line", lexer);
337             assertNextToken(EORECORD, "third line ", lexer);
338             assertNextToken(TOKEN, "last", lexer);
339             assertNextToken(EORECORD, " line ", lexer);
340             assertNextToken(EOF, "", lexer);
341             assertNextToken(EOF, "", lexer);
342         }
343     }
344 
345     @Test
346     void testIsMetaCharCommentStart() throws IOException {
347         try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
348             final int ch = lexer.readEscape();
349             assertEquals('#', ch);
350         }
351     }
352 
353     @Test
354     void testLF() throws Exception {
355         try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
356             assertNextToken("character", lexer);
357             assertNextToken("NotEscaped", lexer);
358         }
359     }
360 
361     // encapsulator tokenizer (single line)
362     @Test
363     void testNextToken4() throws IOException {
364         /*
365          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
366          */
367         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
368         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
369             assertNextToken(TOKEN, "a", lexer);
370             assertNextToken(TOKEN, "foo", lexer);
371             assertNextToken(EORECORD, "b", lexer);
372             assertNextToken(TOKEN, "a", lexer);
373             assertNextToken(TOKEN, " foo", lexer);
374             assertNextToken(EORECORD, "b", lexer);
375             assertNextToken(TOKEN, "a", lexer);
376             assertNextToken(TOKEN, "foo ", lexer);
377             assertNextToken(EORECORD, "b", lexer);
378             assertNextToken(TOKEN, "a", lexer);
379             assertNextToken(TOKEN, " foo ", lexer);
380             // assertTokenEquals(EORECORD, "b", parser);
381             assertNextToken(EOF, "b", lexer);
382         }
383     }
384 
385     // encapsulator tokenizer (multi line, delimiter in string)
386     @Test
387     void testNextToken5() throws IOException {
388         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
389         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT)) {
390             assertNextToken(TOKEN, "a", lexer);
391             assertNextToken(TOKEN, "foo\n", lexer);
392             assertNextToken(EORECORD, "b", lexer);
393             assertNextToken(EORECORD, "foo\n  baar ,,,", lexer);
394             assertNextToken(EOF, "\n\t \n", lexer);
395         }
396     }
397 
398     // change delimiters, comment, encapsulater
399     @Test
400     void testNextToken6() throws IOException {
401         /*
402          * file: a;'b and \' more ' !comment;;;; ;;
403          */
404         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
405         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
406         try (Lexer lexer = createLexer(code, format)) {
407             assertNextToken(TOKEN, "a", lexer);
408             assertNextToken(EORECORD, "b and ' more\n", lexer);
409         }
410     }
411 
412     @Test
413     void testReadEscapeBackspace() throws IOException {
414         try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
415             final int ch = lexer.readEscape();
416             assertEquals(BACKSPACE, ch);
417         }
418     }
419 
420     @Test
421     void testReadEscapeFF() throws IOException {
422         try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
423             final int ch = lexer.readEscape();
424             assertEquals(FF, ch);
425         }
426     }
427 
428     @Test
429     void testReadEscapeTab() throws IOException {
430         try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
431             final int ch = lexer.readEscape();
432             assertNextToken(EOF, "", lexer);
433             assertEquals(TAB, ch);
434         }
435     }
436 
437     @Test
438     void testSurroundingSpacesAreDeleted() throws IOException {
439         final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
440         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
441             assertNextToken(TOKEN, "noSpaces", lexer);
442             assertNextToken(TOKEN, "leadingSpaces", lexer);
443             assertNextToken(TOKEN, "trailingSpaces", lexer);
444             assertNextToken(TOKEN, "surroundingSpaces", lexer);
445             assertNextToken(TOKEN, "", lexer);
446             assertNextToken(TOKEN, "", lexer);
447             assertNextToken(EOF, "", lexer);
448         }
449     }
450 
451     @Test
452     void testSurroundingTabsAreDeleted() throws IOException {
453         final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
454         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
455             assertNextToken(TOKEN, "noTabs", lexer);
456             assertNextToken(TOKEN, "leadingTab", lexer);
457             assertNextToken(TOKEN, "trailingTab", lexer);
458             assertNextToken(TOKEN, "surroundingTabs", lexer);
459             assertNextToken(TOKEN, "", lexer);
460             assertNextToken(TOKEN, "", lexer);
461             assertNextToken(EOF, "", lexer);
462         }
463     }
464 
465     @Test
466     void testTab() throws Exception {
467         try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
468             assertNextToken("character" + TAB + "NotEscaped", lexer);
469         }
470     }
471 
472     @Test
473     void testTrailingTextAfterQuote() throws Exception {
474         final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
475         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
476             assertNextToken(TOKEN, "a b", lexer);
477             assertNextToken(TOKEN, "a \" b", lexer);
478             assertNextToken(EOF, "a b \"\"", lexer);
479         }
480         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
481             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
482         }
483     }
484 
485     @Test
486     void testTrimTrailingSpacesZeroLength() throws Exception {
487         final StringBuilder buffer = new StringBuilder("");
488         try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
489             lexer.trimTrailingSpaces(buffer);
490             assertNextToken(EOF, "", lexer);
491         }
492     }
493 }