View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.FF;
23  import static org.apache.commons.csv.Constants.LF;
24  import static org.apache.commons.csv.Constants.TAB;
25  import static org.apache.commons.csv.Token.Type.COMMENT;
26  import static org.apache.commons.csv.Token.Type.EOF;
27  import static org.apache.commons.csv.Token.Type.EORECORD;
28  import static org.apache.commons.csv.Token.Type.TOKEN;
29  import static org.apache.commons.csv.TokenMatchers.hasContent;
30  import static org.apache.commons.csv.TokenMatchers.matches;
31  import static org.hamcrest.MatcherAssert.assertThat;
32  import static org.junit.jupiter.api.Assertions.assertEquals;
33  import static org.junit.jupiter.api.Assertions.assertFalse;
34  import static org.junit.jupiter.api.Assertions.assertThrows;
35  import static org.junit.jupiter.api.Assertions.assertTrue;
36  
37  import java.io.IOException;
38  import java.io.StringReader;
39  
40  import org.junit.jupiter.api.BeforeEach;
41  import org.junit.jupiter.api.Test;
42  
43  /**
44   *
45   */
46  public class LexerTest {
47  
48      private CSVFormat formatWithEscaping;
49  
50      @SuppressWarnings("resource")
51      private Lexer createLexer(final String input, final CSVFormat format) {
52          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
53      }
54  
55      @BeforeEach
56      public void setUp() {
57          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
58      }
59  
60      // simple token with escaping enabled
61      @Test
62      public void testBackslashWithEscaping() throws IOException {
63          /*
64           * file: a,\,,b \,,
65           */
66          final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
67          final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
68          assertTrue(format.isEscapeCharacterSet());
69          try (final Lexer parser = createLexer(code, format)) {
70              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
71              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
72              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
73              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
74              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
75              assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
76              assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
77          }
78      }
79  
80      // simple token with escaping not enabled
81      @Test
82      public void testBackslashWithoutEscaping() throws IOException {
83          /*
84           * file: a,\,,b \,,
85           */
86          final String code = "a,\\,,b\\\n\\,,";
87          final CSVFormat format = CSVFormat.DEFAULT;
88          assertFalse(format.isEscapeCharacterSet());
89          try (final Lexer parser = createLexer(code, format)) {
90              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
91              // an unquoted single backslash is not an escape char
92              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
93              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
94              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
95              // an unquoted single backslash is not an escape char
96              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
97              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
98              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
99          }
100     }
101 
102     @Test
103     public void testBackspace() throws Exception {
104         try (final Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
105             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
106         }
107     }
108 
109     @Test
110     public void testComments() throws IOException {
111         final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
112                 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
113         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
114         try (final Lexer parser = createLexer(code, format)) {
115             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
116             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
117             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
118             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
119             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
120             assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
121             assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
122             assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
123             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
124             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
125             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
126             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
127             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
128             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
129         }
130     }
131 
132     @Test
133     public void testCommentsAndEmptyLines() throws IOException {
134         final String code = "1,2,3,\n" + // 1
135                 "\n" + // 1b
136                 "\n" + // 1c
137                 "a,b x,c#no-comment\n" + // 2
138                 "#foo\n" + // 3
139                 "\n" + // 4
140                 "\n" + // 4b
141                 "d,e,#no-comment\n" + // 5
142                 "\n" + // 5b
143                 "\n" + // 5c
144                 "# penultimate comment\n" + // 6
145                 "\n" + // 6b
146                 "\n" + // 6c
147                 "# Final comment\n"; // 7
148         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
149         assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
150 
151         try (final Lexer parser = createLexer(code, format)) {
152             assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
153             assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
154             assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
155             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1
156             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1b
157             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1c
158             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
159             assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
160             assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment")); // 2
161             assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo")); // 3
162             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4
163             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4b
164             assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
165             assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
166             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment")); // 5
167             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5b
168             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5c
169             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment")); // 6
170             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6b
171             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6c
172             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment")); // 7
173             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
174             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
175         }
176     }
177 
178     @Test
179     public void testCR() throws Exception {
180         try (final Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
181             assertThat(lexer.nextToken(new Token()), hasContent("character"));
182             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
183         }
184     }
185 
186     // From CSV-1
187     @Test
188     public void testDelimiterIsWhitespace() throws IOException {
189         final String code = "one\ttwo\t\tfour \t five\t six";
190         try (final Lexer parser = createLexer(code, CSVFormat.TDF)) {
191             assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
192             assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
193             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
194             assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
195             assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
196             assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
197         }
198     }
199 
200     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
201     public void testEscapedBackspace() throws Exception {
202         try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
203             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
204         }
205     }
206 
207     @Test
208     public void testEscapedCharacter() throws Exception {
209         try (final Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
210             assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
211         }
212     }
213 
214     @Test
215     public void testEscapedControlCharacter() throws Exception {
216         // we are explicitly using an escape different from \ here
217         try (final Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
218             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
219         }
220     }
221 
222     @Test
223     public void testEscapedControlCharacter2() throws Exception {
224         try (final Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
225             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
226         }
227     }
228 
229     @Test
230     public void testEscapedCR() throws Exception {
231         try (final Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
232             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
233         }
234     }
235 
236     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
237     public void testEscapedFF() throws Exception {
238         try (final Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
239             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
240         }
241     }
242 
243     @Test
244     public void testEscapedLF() throws Exception {
245         try (final Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
246             assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
247         }
248     }
249 
250     @Test
251     public void testEscapedMySqlNullValue() throws Exception {
252         // MySQL uses \N to symbolize null values. We have to restore this
253         try (final Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
254             assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
255         }
256     }
257 
258     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
259     public void testEscapedTab() throws Exception {
260         try (final Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
261             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
262         }
263 
264     }
265 
266     @Test
267     public void testEscapingAtEOF() throws Exception {
268         final String code = "escaping at EOF is evil\\";
269         try (final Lexer lexer = createLexer(code, formatWithEscaping)) {
270             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
271         }
272     }
273 
274     @Test
275     public void testFF() throws Exception {
276         try (final Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
277             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
278         }
279     }
280 
281     @Test
282     public void testIgnoreEmptyLines() throws IOException {
283         final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
284                 "\n" + "last, line \n" + "\n" + "\n" + "\n";
285         final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
286         try (final Lexer parser = createLexer(code, format)) {
287             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
288             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
289             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
290             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
291             assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
292             assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
293             assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
294             assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
295             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
296             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
297         }
298     }
299 
300     @Test
301     public void testIsMetaCharCommentStart() throws IOException {
302         try (final Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
303             final int ch = lexer.readEscape();
304             assertEquals('#', ch);
305         }
306     }
307 
308     @Test
309     public void testLF() throws Exception {
310         try (final Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
311             assertThat(lexer.nextToken(new Token()), hasContent("character"));
312             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
313         }
314     }
315 
316     // encapsulator tokenizer (single line)
317     @Test
318     public void testNextToken4() throws IOException {
319         /*
320          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
321          */
322         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
323         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
324             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
325             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
326             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
327             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
328             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
329             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
330             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
331             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
332             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
333             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
334             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
335             // assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
336             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
337         }
338     }
339 
340     // encapsulator tokenizer (multi line, delimiter in string)
341     @Test
342     public void testNextToken5() throws IOException {
343         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
344         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
345             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
346             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
347             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
348             assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n  baar ,,,"));
349             assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
350         }
351     }
352 
353     // change delimiters, comment, encapsulater
354     @Test
355     public void testNextToken6() throws IOException {
356         /*
357          * file: a;'b and \' more ' !comment;;;; ;;
358          */
359         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
360         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
361         try (final Lexer parser = createLexer(code, format)) {
362             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
363             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
364         }
365     }
366 
367     @Test
368     public void testReadEscapeBackspace() throws IOException {
369         try (final Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
370             final int ch = lexer.readEscape();
371             assertEquals(BACKSPACE, ch);
372         }
373     }
374 
375     @Test
376     public void testReadEscapeFF() throws IOException {
377         try (final Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
378             final int ch = lexer.readEscape();
379             assertEquals(FF, ch);
380         }
381     }
382 
383     @Test
384     public void testReadEscapeTab() throws IOException {
385         try (final Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
386             final int ch = lexer.readEscape();
387             assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
388             assertEquals(TAB, ch);
389         }
390     }
391 
392     @Test
393     public void testSurroundingSpacesAreDeleted() throws IOException {
394         final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
395         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
396             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
397             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
398             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
399             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
400             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
401             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
402             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
403         }
404     }
405 
406     @Test
407     public void testSurroundingTabsAreDeleted() throws IOException {
408         final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
409         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
410             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
411             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
412             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
413             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
414             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
415             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
416             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
417         }
418     }
419 
420     @Test
421     public void testTab() throws Exception {
422         try (final Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
423             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
424         }
425     }
426 
427     @Test
428     public void testTrimTrailingSpacesZeroLength() throws Exception {
429         final StringBuilder buffer = new StringBuilder("");
430         final Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT);
431         lexer.trimTrailingSpaces(buffer);
432         assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
433     }
434 }