View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Constants.BACKSPACE;
23  import static org.apache.commons.csv.Constants.CR;
24  import static org.apache.commons.csv.Constants.FF;
25  import static org.apache.commons.csv.Constants.LF;
26  import static org.apache.commons.csv.Constants.TAB;
27  import static org.apache.commons.csv.Token.Type.COMMENT;
28  import static org.apache.commons.csv.Token.Type.EOF;
29  import static org.apache.commons.csv.Token.Type.EORECORD;
30  import static org.apache.commons.csv.Token.Type.TOKEN;
31  import static org.apache.commons.csv.TokenMatchers.hasContent;
32  import static org.apache.commons.csv.TokenMatchers.matches;
33  import static org.hamcrest.MatcherAssert.assertThat;
34  import static org.junit.jupiter.api.Assertions.assertEquals;
35  import static org.junit.jupiter.api.Assertions.assertFalse;
36  import static org.junit.jupiter.api.Assertions.assertThrows;
37  import static org.junit.jupiter.api.Assertions.assertTrue;
38  
39  import java.io.IOException;
40  import java.io.StringReader;
41  
42  import org.junit.jupiter.api.BeforeEach;
43  import org.junit.jupiter.api.Test;
44  
45  /**
46   */
47  public class LexerTest {
48  
49      private CSVFormat formatWithEscaping;
50  
51      @SuppressWarnings("resource")
52      private Lexer createLexer(final String input, final CSVFormat format) {
53          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
54      }
55  
56      @BeforeEach
57      public void setUp() {
58          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
59      }
60  
61      // simple token with escaping enabled
62      @Test
63      public void testBackslashWithEscaping() throws IOException {
64          /*
65           * file: a,\,,b \,,
66           */
67          final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
68          final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
69          assertTrue(format.isEscapeCharacterSet());
70          try (Lexer parser = createLexer(code, format)) {
71              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
72              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
73              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
74              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
75              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
76              assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
77              assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
78          }
79      }
80  
81      // simple token with escaping not enabled
82      @Test
83      public void testBackslashWithoutEscaping() throws IOException {
84          /*
85           * file: a,\,,b \,,
86           */
87          final String code = "a,\\,,b\\\n\\,,";
88          final CSVFormat format = CSVFormat.DEFAULT;
89          assertFalse(format.isEscapeCharacterSet());
90          try (Lexer parser = createLexer(code, format)) {
91              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
92              // an unquoted single backslash is not an escape char
93              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
94              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
95              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
96              // an unquoted single backslash is not an escape char
97              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
98              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
99              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
100         }
101     }
102 
103     @Test
104     public void testBackspace() throws Exception {
105         try (Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
106             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
107         }
108     }
109 
110     @Test
111     public void testComments() throws IOException {
112         final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
113                 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
114         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
115         try (Lexer parser = createLexer(code, format)) {
116             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
117             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
118             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
119             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
120             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
121             assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
122             assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
123             assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
124             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
125             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
126             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
127             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
128             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
129             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
130         }
131     }
132 
133     @Test
134     public void testCommentsAndEmptyLines() throws IOException {
135         final String code = "1,2,3,\n" + // 1
136                 "\n" + // 1b
137                 "\n" + // 1c
138                 "a,b x,c#no-comment\n" + // 2
139                 "#foo\n" + // 3
140                 "\n" + // 4
141                 "\n" + // 4b
142                 "d,e,#no-comment\n" + // 5
143                 "\n" + // 5b
144                 "\n" + // 5c
145                 "# penultimate comment\n" + // 6
146                 "\n" + // 6b
147                 "\n" + // 6c
148                 "# Final comment\n"; // 7
149         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
150         assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
151 
152         try (Lexer parser = createLexer(code, format)) {
153             assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
154             assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
155             assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
156             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1
157             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1b
158             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1c
159             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
160             assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
161             assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment")); // 2
162             assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo")); // 3
163             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4
164             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4b
165             assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
166             assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
167             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment")); // 5
168             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5b
169             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5c
170             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment")); // 6
171             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6b
172             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6c
173             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment")); // 7
174             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
175             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
176         }
177     }
178 
179     @Test
180     public void testCR() throws Exception {
181         try (Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
182             assertThat(lexer.nextToken(new Token()), hasContent("character"));
183             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
184         }
185     }
186 
187     // From CSV-1
188     @Test
189     public void testDelimiterIsWhitespace() throws IOException {
190         final String code = "one\ttwo\t\tfour \t five\t six";
191         try (Lexer parser = createLexer(code, CSVFormat.TDF)) {
192             assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
193             assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
194             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
195             assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
196             assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
197             assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
198         }
199     }
200 
201     @Test
202     public void testEOFWithoutClosingQuote() throws Exception {
203         final String code = "a,\"b";
204         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).get())) {
205             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
206             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
207         }
208         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).get())) {
209             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
210             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
211         }
212     }
213 
214     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
215     public void testEscapedBackspace() throws Exception {
216         try (Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
217             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
218         }
219     }
220 
221     @Test
222     public void testEscapedCharacter() throws Exception {
223         try (Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
224             assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
225         }
226     }
227 
228     @Test
229     public void testEscapedControlCharacter() throws Exception {
230         // we are explicitly using an escape different from \ here
231         try (Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
232             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
233         }
234     }
235 
236     @Test
237     public void testEscapedControlCharacter2() throws Exception {
238         try (Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
239             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
240         }
241     }
242 
243     @Test
244     public void testEscapedCR() throws Exception {
245         try (Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
246             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
247         }
248     }
249 
250     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
251     public void testEscapedFF() throws Exception {
252         try (Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
253             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
254         }
255     }
256 
257     @Test
258     public void testEscapedLF() throws Exception {
259         try (Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
260             assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
261         }
262     }
263 
264     @Test
265     public void testEscapedMySqlNullValue() throws Exception {
266         // MySQL uses \N to symbolize null values. We have to restore this
267         try (Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
268             assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
269         }
270     }
271 
272     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
273     public void testEscapedTab() throws Exception {
274         try (Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
275             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
276         }
277 
278     }
279 
280     @Test
281     public void testEscapingAtEOF() throws Exception {
282         final String code = "escaping at EOF is evil\\";
283         try (Lexer lexer = createLexer(code, formatWithEscaping)) {
284             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
285         }
286     }
287 
288     @Test
289     public void testFF() throws Exception {
290         try (Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
291             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
292         }
293     }
294 
295     @Test
296     public void testIgnoreEmptyLines() throws IOException {
297         final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
298                 "\n" + "last, line \n" + "\n" + "\n" + "\n";
299         final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
300         try (Lexer parser = createLexer(code, format)) {
301             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
302             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
303             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
304             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
305             assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
306             assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
307             assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
308             assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
309             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
310             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
311         }
312     }
313 
314     @Test
315     public void testIsMetaCharCommentStart() throws IOException {
316         try (Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
317             final int ch = lexer.readEscape();
318             assertEquals('#', ch);
319         }
320     }
321 
322     @Test
323     public void testLF() throws Exception {
324         try (Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
325             assertThat(lexer.nextToken(new Token()), hasContent("character"));
326             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
327         }
328     }
329 
330     // encapsulator tokenizer (single line)
331     @Test
332     public void testNextToken4() throws IOException {
333         /*
334          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
335          */
336         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
337         try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
338             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
339             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
340             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
341             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
342             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
343             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
344             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
345             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
346             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
347             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
348             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
349             // assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
350             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
351         }
352     }
353 
354     // encapsulator tokenizer (multi line, delimiter in string)
355     @Test
356     public void testNextToken5() throws IOException {
357         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
358         try (Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
359             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
360             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
361             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
362             assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n  baar ,,,"));
363             assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
364         }
365     }
366 
367     // change delimiters, comment, encapsulater
368     @Test
369     public void testNextToken6() throws IOException {
370         /*
371          * file: a;'b and \' more ' !comment;;;; ;;
372          */
373         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
374         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
375         try (Lexer parser = createLexer(code, format)) {
376             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
377             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
378         }
379     }
380 
381     @Test
382     public void testReadEscapeBackspace() throws IOException {
383         try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
384             final int ch = lexer.readEscape();
385             assertEquals(BACKSPACE, ch);
386         }
387     }
388 
389     @Test
390     public void testReadEscapeFF() throws IOException {
391         try (Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
392             final int ch = lexer.readEscape();
393             assertEquals(FF, ch);
394         }
395     }
396 
397     @Test
398     public void testReadEscapeTab() throws IOException {
399         try (Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
400             final int ch = lexer.readEscape();
401             assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
402             assertEquals(TAB, ch);
403         }
404     }
405 
406     @Test
407     public void testSurroundingSpacesAreDeleted() throws IOException {
408         final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
409         try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
410             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
411             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
412             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
413             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
414             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
415             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
416             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
417         }
418     }
419 
420     @Test
421     public void testSurroundingTabsAreDeleted() throws IOException {
422         final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
423         try (Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
424             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
425             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
426             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
427             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
428             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
429             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
430             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
431         }
432     }
433 
434     @Test
435     public void testTab() throws Exception {
436         try (Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
437             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
438         }
439     }
440 
441     @Test
442     public void testTrailingTextAfterQuote() throws Exception {
443         final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
444         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).get())) {
445             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
446             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
447             assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
448         }
449         try (Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).get())) {
450             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
451         }
452     }
453 
454     @Test
455     public void testTrimTrailingSpacesZeroLength() throws Exception {
456         final StringBuilder buffer = new StringBuilder("");
457         try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
458             lexer.trimTrailingSpaces(buffer);
459             assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
460         }
461     }
462 }