View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Constants.BACKSPACE;
23  import static org.apache.commons.csv.Constants.CR;
24  import static org.apache.commons.csv.Constants.FF;
25  import static org.apache.commons.csv.Constants.LF;
26  import static org.apache.commons.csv.Constants.TAB;
27  import static org.apache.commons.csv.Token.Type.COMMENT;
28  import static org.apache.commons.csv.Token.Type.EOF;
29  import static org.apache.commons.csv.Token.Type.EORECORD;
30  import static org.apache.commons.csv.Token.Type.TOKEN;
31  import static org.junit.jupiter.api.Assertions.assertEquals;
32  import static org.junit.jupiter.api.Assertions.assertFalse;
33  import static org.junit.jupiter.api.Assertions.assertThrows;
34  import static org.junit.jupiter.api.Assertions.assertTrue;
35  
36  import java.io.IOException;
37  import java.io.StringReader;
38  
39  import org.junit.jupiter.api.BeforeEach;
40  import org.junit.jupiter.api.Test;
41  
42  /**
43   */
44  public class LexerTest {
45  
46      private static void assertContent(final String expectedContent, final Token actualToken) {
47          assertEquals(expectedContent, actualToken.content.toString());
48      }
49  
50      private static void assertNextToken(final String expectedContent, final Lexer lexer) throws IOException {
51          assertContent(expectedContent, lexer.nextToken(new Token()));
52      }
53  
54      private static void assertNextToken(final Token.Type expectedType, final String expectedContent, final Lexer lexer) throws IOException {
55          final Token actualToken = lexer.nextToken(new Token());
56          assertEquals(expectedType, actualToken.type);
57          assertContent(expectedContent, actualToken);
58      }
59  
60      private CSVFormat formatWithEscaping;
61  
62      @SuppressWarnings("resource")
63      private Lexer createLexer(final String input, final CSVFormat format) {
64          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
65      }
66  
67      @BeforeEach
68      public void setUp() {
69          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
70      }
71  
72      // simple token with escaping enabled
73      @Test
74      public void testBackslashWithEscaping() throws IOException {
75          /*
76           * file: a,\,,b \,,
77           */
78          final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
79          final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
80          assertTrue(format.isEscapeCharacterSet());
81          try (Lexer lexer = createLexer(code, format)) {
82              assertNextToken(TOKEN, "a", lexer);
83              assertNextToken(TOKEN, ",", lexer);
84              assertNextToken(EORECORD, "b\\", lexer);
85              assertNextToken(TOKEN, ",", lexer);
86              assertNextToken(TOKEN, "\nc", lexer);
87              assertNextToken(EORECORD, "d\r", lexer);
88              assertNextToken(EOF, "e", lexer);
89          }
90      }
91  
92      // simple token with escaping not enabled
93      @Test
94      public void testBackslashWithoutEscaping() throws IOException {
95          /*
96           * file: a,\,,b \,,
97           */
98          final String code = "a,\\,,b\\\n\\,,";
99          final CSVFormat format = CSVFormat.DEFAULT;
100         assertFalse(format.isEscapeCharacterSet());
101         try (Lexer lexer = createLexer(code, format)) {
102             // parser.nextToken(new Token())
103             assertNextToken(TOKEN, "a", lexer);
104             // an unquoted single backslash is not an escape char
105             assertNextToken(TOKEN, "\\", lexer);
106             assertNextToken(TOKEN, "", lexer);
107             assertNextToken(EORECORD, "b\\", lexer);
108             // an unquoted single backslash is not an escape char
109             assertNextToken(TOKEN, "\\", lexer);
110             assertNextToken(TOKEN, "", lexer);
111             assertNextToken(EOF, "", lexer);
112         }
113     }
114 
115     @Test
116     public void testBackspace() throws Exception {
117         try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
118             assertNextToken("character" + BACKSPACE + "NotEscaped", lexer);
119         }
120     }
121 
122     @Test
123     public void testComments() throws IOException {
124         final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
125                 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
126         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
127         try (Lexer lexer = createLexer(code, format)) {
128             assertNextToken(TOKEN, "first", lexer);
129             assertNextToken(TOKEN, "line", lexer);
130             assertNextToken(EORECORD, "", lexer);
131             assertNextToken(TOKEN, "second", lexer);
132             assertNextToken(TOKEN, "line", lexer);
133             assertNextToken(EORECORD, "tokenWith#no-comment", lexer);
134             assertNextToken(COMMENT, "comment line", lexer);
135             assertNextToken(TOKEN, "third", lexer);
136             assertNextToken(TOKEN, "line", lexer);
137             assertNextToken(EORECORD, "#no-comment", lexer);
138             assertNextToken(COMMENT, "penultimate comment", lexer);
139             assertNextToken(COMMENT, "Final comment", lexer);
140             assertNextToken(EOF, "", lexer);
141             assertNextToken(EOF, "", lexer);
142         }
143     }
144 
145     @Test
146     public void testCommentsAndEmptyLines() throws IOException {
147         final String code = "1,2,3,\n" + // 1
148                 "\n" + // 1b
149                 "\n" + // 1c
150                 "a,b x,c#no-comment\n" + // 2
151                 "#foo\n" + // 3
152                 "\n" + // 4
153                 "\n" + // 4b
154                 "d,e,#no-comment\n" + // 5
155                 "\n" + // 5b
156                 "\n" + // 5c
157                 "# penultimate comment\n" + // 6
158                 "\n" + // 6b
159                 "\n" + // 6c
160                 "# Final comment\n"; // 7
161         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
162         assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
163 
164         try (Lexer lexer = createLexer(code, format)) {
165             assertNextToken(TOKEN, "1", lexer);
166             assertNextToken(TOKEN, "2", lexer);
167             assertNextToken(TOKEN, "3", lexer);
168             assertNextToken(EORECORD, "", lexer); // 1
169             assertNextToken(EORECORD, "", lexer); // 1b
170             assertNextToken(EORECORD, "", lexer); // 1c
171             assertNextToken(TOKEN, "a", lexer);
172             assertNextToken(TOKEN, "b x", lexer);
173             assertNextToken(EORECORD, "c#no-comment", lexer); // 2
174             assertNextToken(COMMENT, "foo", lexer); // 3
175             assertNextToken(EORECORD, "", lexer); // 4
176             assertNextToken(EORECORD, "", lexer); // 4b
177             assertNextToken(TOKEN, "d", lexer);
178             assertNextToken(TOKEN, "e", lexer);
179             assertNextToken(EORECORD, "#no-comment", lexer); // 5
180             assertNextToken(EORECORD, "", lexer); // 5b
181             assertNextToken(EORECORD, "", lexer); // 5c
182             assertNextToken(COMMENT, "penultimate comment", lexer); // 6
183             assertNextToken(EORECORD, "", lexer); // 6b
184             assertNextToken(EORECORD, "", lexer); // 6c
185             assertNextToken(COMMENT, "Final comment", lexer); // 7
186             assertNextToken(EOF, "", lexer);
187             assertNextToken(EOF, "", lexer);
188         }
189     }
190 
191     @Test
192     public void testCR() throws Exception {
193         try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
194             assertNextToken("character", lexer);
195             assertNextToken("NotEscaped", lexer);
196         }
197     }
198 
199     // From CSV-1
200     @Test
201     public void testDelimiterIsWhitespace() throws IOException {
202         final String code = "one\ttwo\t\tfour \t five\t six";
203         try (Lexer lexer = createLexer(code, CSVFormat.TDF)) {
204             assertNextToken(TOKEN, "one", lexer);
205             assertNextToken(TOKEN, "two", lexer);
206             assertNextToken(TOKEN, "", lexer);
207             assertNextToken(TOKEN, "four", lexer);
208             assertNextToken(TOKEN, "five", lexer);
209             assertNextToken(EOF, "six", lexer);
210         }
211     }
212 
213     @Test
214     public void testEOFWithoutClosingQuote() throws Exception {
215         final String code = "a,\"b";
216         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
217             assertNextToken(TOKEN, "a", lexer);
218             assertNextToken(EOF, "b", lexer);
219         }
220         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
221             assertNextToken(TOKEN, "a", lexer);
222             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
223         }
224     }
225 
226     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
227     public void testEscapedBackspace() throws Exception {
228         try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
229             assertNextToken("character" + BACKSPACE + "Escaped", lexer);
230         }
231     }
232 
233     @Test
234     public void testEscapedCharacter() throws Exception {
235         try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
236             assertNextToken("character\\aEscaped", lexer);
237         }
238     }
239 
240     @Test
241     public void testEscapedControlCharacter() throws Exception {
242         // we are explicitly using an escape different from \ here
243         try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
244             assertNextToken("character" + CR + "Escaped", lexer);
245         }
246     }
247 
248     @Test
249     public void testEscapedControlCharacter2() throws Exception {
250         try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
251             assertNextToken("character" + CR + "Escaped", lexer);
252         }
253     }
254 
255     @Test
256     public void testEscapedCR() throws Exception {
257         try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
258             assertNextToken("character" + CR + "Escaped", lexer);
259         }
260     }
261 
262     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
263     public void testEscapedFF() throws Exception {
264         try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
265             assertNextToken("character" + FF + "Escaped", lexer);
266         }
267     }
268 
269     @Test
270     public void testEscapedLF() throws Exception {
271         try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
272             assertNextToken("character" + LF + "Escaped", lexer);
273         }
274     }
275 
276     @Test
277     public void testEscapedMySqlNullValue() throws Exception {
278         // MySQL uses \N to symbolize null values. We have to restore this
279         try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
280             assertNextToken("character\\NEscaped", lexer);
281         }
282     }
283 
284     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
285     public void testEscapedTab() throws Exception {
286         try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
287             assertNextToken("character" + TAB + "Escaped", lexer);
288         }
289 
290     }
291 
292     @Test
293     public void testEscapingAtEOF() throws Exception {
294         final String code = "escaping at EOF is evil\\";
295         try (Lexer lexer = createLexer(code, formatWithEscaping)) {
296             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
297         }
298     }
299 
300     @Test
301     public void testFF() throws Exception {
302         try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
303             assertNextToken("character" + FF + "NotEscaped", lexer);
304         }
305     }
306 
307     @Test
308     public void testIgnoreEmptyLines() throws IOException {
309         final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
310                 "\n" + "last, line \n" + "\n" + "\n" + "\n";
311         final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
312         try (Lexer lexer = createLexer(code, format)) {
313             assertNextToken(TOKEN, "first", lexer);
314             assertNextToken(TOKEN, "line", lexer);
315             assertNextToken(EORECORD, "", lexer);
316             assertNextToken(TOKEN, "second", lexer);
317             assertNextToken(EORECORD, "line", lexer);
318             assertNextToken(EORECORD, "third line ", lexer);
319             assertNextToken(TOKEN, "last", lexer);
320             assertNextToken(EORECORD, " line ", lexer);
321             assertNextToken(EOF, "", lexer);
322             assertNextToken(EOF, "", lexer);
323         }
324     }
325 
326     @Test
327     public void testIsMetaCharCommentStart() throws IOException {
328         try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
329             final int ch = lexer.readEscape();
330             assertEquals('#', ch);
331         }
332     }
333 
334     @Test
335     public void testLF() throws Exception {
336         try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
337             assertNextToken("character", lexer);
338             assertNextToken("NotEscaped", lexer);
339         }
340     }
341 
342     // encapsulator tokenizer (single line)
343     @Test
344     public void testNextToken4() throws IOException {
345         /*
346          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
347          */
348         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
349         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
350             assertNextToken(TOKEN, "a", lexer);
351             assertNextToken(TOKEN, "foo", lexer);
352             assertNextToken(EORECORD, "b", lexer);
353             assertNextToken(TOKEN, "a", lexer);
354             assertNextToken(TOKEN, " foo", lexer);
355             assertNextToken(EORECORD, "b", lexer);
356             assertNextToken(TOKEN, "a", lexer);
357             assertNextToken(TOKEN, "foo ", lexer);
358             assertNextToken(EORECORD, "b", lexer);
359             assertNextToken(TOKEN, "a", lexer);
360             assertNextToken(TOKEN, " foo ", lexer);
361             // assertTokenEquals(EORECORD, "b", parser);
362             assertNextToken(EOF, "b", lexer);
363         }
364     }
365 
366     // encapsulator tokenizer (multi line, delimiter in string)
367     @Test
368     public void testNextToken5() throws IOException {
369         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
370         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT)) {
371             assertNextToken(TOKEN, "a", lexer);
372             assertNextToken(TOKEN, "foo\n", lexer);
373             assertNextToken(EORECORD, "b", lexer);
374             assertNextToken(EORECORD, "foo\n  baar ,,,", lexer);
375             assertNextToken(EOF, "\n\t \n", lexer);
376         }
377     }
378 
379     // change delimiters, comment, encapsulater
380     @Test
381     public void testNextToken6() throws IOException {
382         /*
383          * file: a;'b and \' more ' !comment;;;; ;;
384          */
385         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
386         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
387         try (Lexer lexer = createLexer(code, format)) {
388             assertNextToken(TOKEN, "a", lexer);
389             assertNextToken(EORECORD, "b and ' more\n", lexer);
390         }
391     }
392 
393     @Test
394     public void testReadEscapeBackspace() throws IOException {
395         try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
396             final int ch = lexer.readEscape();
397             assertEquals(BACKSPACE, ch);
398         }
399     }
400 
401     @Test
402     public void testReadEscapeFF() throws IOException {
403         try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
404             final int ch = lexer.readEscape();
405             assertEquals(FF, ch);
406         }
407     }
408 
409     @Test
410     public void testReadEscapeTab() throws IOException {
411         try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
412             final int ch = lexer.readEscape();
413             assertNextToken(EOF, "", lexer);
414             assertEquals(TAB, ch);
415         }
416     }
417 
418     @Test
419     public void testSurroundingSpacesAreDeleted() throws IOException {
420         final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
421         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
422             assertNextToken(TOKEN, "noSpaces", lexer);
423             assertNextToken(TOKEN, "leadingSpaces", lexer);
424             assertNextToken(TOKEN, "trailingSpaces", lexer);
425             assertNextToken(TOKEN, "surroundingSpaces", lexer);
426             assertNextToken(TOKEN, "", lexer);
427             assertNextToken(TOKEN, "", lexer);
428             assertNextToken(EOF, "", lexer);
429         }
430     }
431 
432     @Test
433     public void testSurroundingTabsAreDeleted() throws IOException {
434         final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
435         try (Lexer lexer = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
436             assertNextToken(TOKEN, "noTabs", lexer);
437             assertNextToken(TOKEN, "leadingTab", lexer);
438             assertNextToken(TOKEN, "trailingTab", lexer);
439             assertNextToken(TOKEN, "surroundingTabs", lexer);
440             assertNextToken(TOKEN, "", lexer);
441             assertNextToken(TOKEN, "", lexer);
442             assertNextToken(EOF, "", lexer);
443         }
444     }
445 
446     @Test
447     public void testTab() throws Exception {
448         try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
449             assertNextToken("character" + TAB + "NotEscaped", lexer);
450         }
451     }
452 
453     @Test
454     public void testTrailingTextAfterQuote() throws Exception {
455         final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
456         try (Lexer lexer = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
457             assertNextToken(TOKEN, "a b", lexer);
458             assertNextToken(TOKEN, "a \" b", lexer);
459             assertNextToken(EOF, "a b \"\"", lexer);
460         }
461         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
462             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
463         }
464     }
465 
466     @Test
467     public void testTrimTrailingSpacesZeroLength() throws Exception {
468         final StringBuilder buffer = new StringBuilder("");
469         try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
470             lexer.trimTrailingSpaces(buffer);
471             assertNextToken(EOF, "", lexer);
472         }
473     }
474 }