View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.FF;
23  import static org.apache.commons.csv.Constants.LF;
24  import static org.apache.commons.csv.Constants.TAB;
25  import static org.apache.commons.csv.Token.Type.COMMENT;
26  import static org.apache.commons.csv.Token.Type.EOF;
27  import static org.apache.commons.csv.Token.Type.EORECORD;
28  import static org.apache.commons.csv.Token.Type.TOKEN;
29  import static org.apache.commons.csv.TokenMatchers.hasContent;
30  import static org.apache.commons.csv.TokenMatchers.matches;
31  import static org.junit.Assert.assertFalse;
32  import static org.junit.Assert.assertThat;
33  import static org.junit.Assert.assertTrue;
34  
35  import java.io.IOException;
36  import java.io.StringReader;
37  
38  import org.junit.Before;
39  import org.junit.Test;
40  
41  /**
42   *
43   */
44  public class LexerTest {
45  
46      private CSVFormat formatWithEscaping;
47  
48      @Before
49      public void setUp() {
50          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
51      }
52  
53      private Lexer createLexer(final String input, final CSVFormat format) {
54          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
55      }
56  
57      @Test
58      public void testSurroundingSpacesAreDeleted() throws IOException {
59          final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
60          try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
61              assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
62              assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
63              assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
64              assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
65              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
66              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
67              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
68          }
69      }
70  
71      @Test
72      public void testSurroundingTabsAreDeleted() throws IOException {
73          final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
74          try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
75              assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
76              assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
77              assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
78              assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
79              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
80              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
81              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
82          }
83      }
84  
85      @Test
86      public void testIgnoreEmptyLines() throws IOException {
87          final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
88                  "\n" + "last, line \n" + "\n" + "\n" + "\n";
89          final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
90          try (final Lexer parser = createLexer(code, format)) {
91              assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
92              assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
93              assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
94              assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
95              assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
96              assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
97              assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
98              assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
99              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
100             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
101         }
102     }
103 
104     @Test
105     public void testComments() throws IOException {
106         final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
107                 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
108         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
109         try (final Lexer parser = createLexer(code, format)) {
110             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
111             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
112             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
113             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
114             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
115             assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
116             assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
117             assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
118             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
119             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
120             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
121             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
122             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
123             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
124         }
125     }
126 
127     @Test
128     public void testCommentsAndEmptyLines() throws IOException {
129         final String code = "1,2,3,\n" + // 1
130                 "\n" + // 1b
131                 "\n" + // 1c
132                 "a,b x,c#no-comment\n" + // 2
133                 "#foo\n" + // 3
134                 "\n" + // 4
135                 "\n" + // 4b
136                 "d,e,#no-comment\n" + // 5
137                 "\n" + // 5b
138                 "\n" + // 5c
139                 "# penultimate comment\n" + // 6
140                 "\n" + // 6b
141                 "\n" + // 6c
142                 "# Final comment\n"; // 7
143         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
144         assertFalse("Should not ignore empty lines", format.getIgnoreEmptyLines());
145 
146         try (final Lexer parser = createLexer(code, format)) {
147             assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
148             assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
149             assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
150             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1
151             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1b
152             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1c
153             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
154             assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
155             assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment")); // 2
156             assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo")); // 3
157             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4
158             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4b
159             assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
160             assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
161             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment")); // 5
162             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5b
163             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5c
164             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment")); // 6
165             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6b
166             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6c
167             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment")); // 7
168             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
169             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
170         }
171     }
172 
173     // simple token with escaping not enabled
174     @Test
175     public void testBackslashWithoutEscaping() throws IOException {
176         /*
177          * file: a,\,,b \,,
178          */
179         final String code = "a,\\,,b\\\n\\,,";
180         final CSVFormat format = CSVFormat.DEFAULT;
181         assertFalse(format.isEscapeCharacterSet());
182         try (final Lexer parser = createLexer(code, format)) {
183             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
184             // an unquoted single backslash is not an escape char
185             assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
186             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
187             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
188             // an unquoted single backslash is not an escape char
189             assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
190             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
191             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
192         }
193     }
194 
195     // simple token with escaping enabled
196     @Test
197     public void testBackslashWithEscaping() throws IOException {
198         /*
199          * file: a,\,,b \,,
200          */
201         final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
202         final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
203         assertTrue(format.isEscapeCharacterSet());
204         try (final Lexer parser = createLexer(code, format)) {
205             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
206             assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
207             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
208             assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
209             assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
210             assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
211             assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
212         }
213     }
214 
215     // encapsulator tokenizer (single line)
216     @Test
217     public void testNextToken4() throws IOException {
218         /*
219          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
220          */
221         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
222         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
223             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
224             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
225             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
226             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
227             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
228             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
229             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
230             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
231             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
232             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
233             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
234             // assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
235             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
236         }
237     }
238 
239     // encapsulator tokenizer (multi line, delimiter in string)
240     @Test
241     public void testNextToken5() throws IOException {
242         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
243         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
244             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
245             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
246             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
247             assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n  baar ,,,"));
248             assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
249         }
250     }
251 
252     // change delimiters, comment, encapsulater
253     @Test
254     public void testNextToken6() throws IOException {
255         /*
256          * file: a;'b and \' more ' !comment;;;; ;;
257          */
258         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
259         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
260         try (final Lexer parser = createLexer(code, format)) {
261             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
262             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
263         }
264     }
265 
266     // From CSV-1
267     @Test
268     public void testDelimiterIsWhitespace() throws IOException {
269         final String code = "one\ttwo\t\tfour \t five\t six";
270         try (final Lexer parser = createLexer(code, CSVFormat.TDF)) {
271             assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
272             assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
273             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
274             assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
275             assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
276             assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
277         }
278     }
279 
280     @Test
281     public void testEscapedCR() throws Exception {
282         try (final Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
283             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
284         }
285     }
286 
287     @Test
288     public void testCR() throws Exception {
289         try (final Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
290             assertThat(lexer.nextToken(new Token()), hasContent("character"));
291             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
292         }
293     }
294 
295     @Test
296     public void testEscapedLF() throws Exception {
297         try (final Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
298             assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
299         }
300     }
301 
302     @Test
303     public void testLF() throws Exception {
304         try (final Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
305             assertThat(lexer.nextToken(new Token()), hasContent("character"));
306             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
307         }
308     }
309 
310     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
311     public void testEscapedTab() throws Exception {
312         try (final Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
313             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
314         }
315 
316     }
317 
318     @Test
319     public void testTab() throws Exception {
320         try (final Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
321             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
322         }
323     }
324 
325     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
326     public void testEscapedBackspace() throws Exception {
327         try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
328             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
329         }
330     }
331 
332     @Test
333     public void testBackspace() throws Exception {
334         try (final Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
335             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
336         }
337     }
338 
339     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
340     public void testEscapedFF() throws Exception {
341         try (final Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
342             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
343         }
344     }
345 
346     @Test
347     public void testFF() throws Exception {
348         try (final Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
349             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
350         }
351     }
352 
353     @Test
354     public void testEscapedMySqlNullValue() throws Exception {
355         // MySQL uses \N to symbolize null values. We have to restore this
356         try (final Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
357             assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
358         }
359     }
360 
361     @Test
362     public void testEscapedCharacter() throws Exception {
363         try (final Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
364             assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
365         }
366     }
367 
368     @Test
369     public void testEscapedControlCharacter() throws Exception {
370         // we are explicitly using an escape different from \ here
371         try (final Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
372             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
373         }
374     }
375 
376     @Test
377     public void testEscapedControlCharacter2() throws Exception {
378         try (final Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
379             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
380         }
381     }
382 
383     @Test(expected = IOException.class)
384     public void testEscapingAtEOF() throws Exception {
385         final String code = "escaping at EOF is evil\\";
386         try (final Lexer lexer = createLexer(code, formatWithEscaping)) {
387             lexer.nextToken(new Token());
388         }
389     }
390 }