View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import java.io.IOException;
21  import java.io.Reader;
22  import java.io.StringReader;
23  import java.util.ArrayList;
24  import java.util.Iterator;
25  import java.util.List;
26  import java.util.NoSuchElementException;
27  
28  import org.apache.commons.csv.CSVLexer.Token;
29  
30  import static org.apache.commons.csv.CSVLexer.Token.Type.*;
31  
32  /**
33   * Parses CSV files according to the specified configuration.
34   *
35   * Because CSV appears in many different dialects, the parser supports many
36   * configuration settings by allowing the specification of a {@link CSVFormat}.
37   *
38   * <p>Parsing of a csv-string having tabs as separators,
39   * '"' as an optional value encapsulator, and comments starting with '#':</p>
40   * <pre>
41   * CSVFormat format = new CSVFormat('\t', '"', '#');
42   * Reader in = new StringReader("a\tb\nc\td");
43   * String[][] records = new CSVParser(in, format).getRecords();
44   * </pre>
45   *
46   * <p>Parsing of a csv-string in Excel CSV format, using a for-each loop:</p>
47   * <pre>
48   * Reader in = new StringReader("a;b\nc;d");
49   * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
50   * for (String[] record : parser) {
51   *     ...
52   * }
53   * </pre>
54   *
55   * <p>
56   * Internal parser state is completely covered by the format
57   * and the reader-state.</p>
58   *
59   * <p>see <a href="package-summary.html">package documentation</a>
60   * for more details</p>
61   */
62  public class CSVParser implements Iterable<String[]> {
63  
64      /** Immutable empty String array. */
65      private static final String[] EMPTY_STRING_ARRAY = new String[0];
66  
67      private final CSVLexer lexer;
68      
69      // the following objects are shared to reduce garbage
70      
71      /** A record buffer for getRecord(). Grows as necessary and is reused. */
72      private final List<String> record = new ArrayList<String>();
73      private final Token reusableToken = new Token();
74  
75      /**
76       * CSV parser using the default {@link CSVFormat}.
77       *
78       * @param input a Reader containing "csv-formatted" input
79       * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
80       */
81      public CSVParser(Reader input) {
82          this(input, CSVFormat.DEFAULT);
83      }
84  
85      /**
86       * Customized CSV parser using the given {@link CSVFormat}
87       *
88       * @param input    a Reader containing "csv-formatted" input
89       * @param format the CSVFormat used for CSV parsing
90       * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
91       */
92      public CSVParser(Reader input, CSVFormat format) {
93          format.validate();
94          
95          if (format.isUnicodeEscapesInterpreted()) {
96              input = new UnicodeUnescapeReader(input);
97          }
98          
99          this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
100     }
101 
102     /**
103      * Customized CSV parser using the given {@link CSVFormat}
104      *
105      * @param input    a String containing "csv-formatted" input
106      * @param format the CSVFormat used for CSV parsing
107      * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
108      */
109     public CSVParser(String input, CSVFormat format) {
110         this(new StringReader(input), format);
111     }
112 
113 
114     /**
115      * Parses the CSV input according to the given format and returns the content
116      * as an array of records (whereas records are arrays of single values).
117      * <p/>
118      * The returned content starts at the current parse-position in the stream.
119      *
120      * @return matrix of records x values ('null' when end of file)
121      * @throws IOException on parse error or input read-failure
122      */
123     public String[][] getRecords() throws IOException {
124         List<String[]> records = new ArrayList<String[]>();
125         String[] record;
126         while ((record = getRecord()) != null) {
127             records.add(record);
128         }
129         
130         if (!records.isEmpty()) {
131             return records.toArray(new String[records.size()][]);
132         } else {
133             return null;
134         }
135     }
136 
137     /**
138      * Parses the next record from the current point in the stream.
139      *
140      * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
141      * @throws IOException on parse error or input read-failure
142      */
143     String[] getRecord() throws IOException {
144         String[] result = EMPTY_STRING_ARRAY;
145         record.clear();
146         while (true) {
147             reusableToken.reset();
148             lexer.nextToken(reusableToken);
149             switch (reusableToken.type) {
150                 case TOKEN:
151                     record.add(reusableToken.content.toString());
152                     break;
153                 case EORECORD:
154                     record.add(reusableToken.content.toString());
155                     break;
156                 case EOF:
157                     if (reusableToken.isReady) {
158                         record.add(reusableToken.content.toString());
159                     } else {
160                         result = null;
161                     }
162                     break;
163                 case INVALID:
164                     // error: throw IOException
165                     throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
166                     // unreachable: break;
167             }
168             if (reusableToken.type != TOKEN) {
169                 break;
170             }
171         }
172         if (!record.isEmpty()) {
173             result = record.toArray(new String[record.size()]);
174         }
175         return result;
176     }
177 
178     /**
179      * Returns an iterator on the records. IOExceptions occuring
180      * during the iteration are wrapped in a RuntimeException.
181      */
182     public Iterator<String[]> iterator() {
183         return new Iterator<String[]>() {
184             String[] current;
185             
186             public boolean hasNext() {
187                 if (current == null) {
188                     current = getNextLine();
189                 }
190                 
191                 return current != null;
192             }
193 
194             public String[] next() {
195                 String[] next = current;
196                 current = null;
197 
198                 if (next == null) {
199                     // hasNext() wasn't called before
200                     next = getNextLine();
201                     if (next == null) {
202                         throw new NoSuchElementException("No more CSV records available");
203                     }
204                 }
205                 
206                 return next;
207             }
208             
209             private String[] getNextLine() {
210                 try {
211                     return getRecord();
212                 } catch (IOException e) {
213                     throw new RuntimeException(e);
214                 }
215             }
216 
217             public void remove() { }
218         };
219     }
220 
221     /**
222      * Returns the current line number in the input stream.
223      * <p/>
224      * ATTENTION: in case your csv has multiline-values the returned
225      * number does not correspond to the record-number
226      *
227      * @return current line number
228      */
229     public int getLineNumber() {
230         return lexer.getLineNumber();
231     }
232 }
233 
234 
235 class CSVLexer {
236 
237     /** length of the initial token (content-)buffer */
238     private static final int INITIAL_TOKEN_LENGTH = 50;
239     
240     private final CharBuffer wsBuf = new CharBuffer();
241     
242     private final CSVFormat format;
243     
244     /** The input stream */
245     private final ExtendedBufferedReader in;
246 
247     /**
248      * Token is an internal token representation.
249      * <p/>
250      * It is used as contract between the lexer and the parser.
251      */
252     static class Token {
253 
254         enum Type {
255             /** Token has no valid content, i.e. is in its initialized state. */
256             INVALID,
257             
258             /** Token with content, at beginning or in the middle of a line. */
259             TOKEN,
260             
261             /** Token (which can have content) when end of file is reached. */
262             EOF,
263             
264             /** Token with content when end of a line is reached. */
265             EORECORD
266         }
267         
268         /** Token type */
269         Type type = INVALID;
270         
271         /** The content buffer. */
272         CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
273         
274         /** Token ready flag: indicates a valid token with content (ready for the parser). */
275         boolean isReady;
276 
277         Token reset() {
278             content.clear();
279             type = INVALID;
280             isReady = false;
281             return this;
282         }
283     }
284 
285     CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
286         this.format = format;
287         this.in = in;
288     }
289 
290     public int getLineNumber() {
291         return in.getLineNumber();
292     }
293 
294     /**
295      * Returns the next token.
296      * <p/>
297      * A token corresponds to a term, a record change or an end-of-file indicator.
298      *
299      * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
300      * @return the next token found
301      * @throws IOException on stream access error
302      */
303     Token nextToken(Token tkn) throws IOException {
304         wsBuf.clear(); // reuse
305 
306         // get the last read char (required for empty line detection)
307         int lastChar = in.readAgain();
308 
309         //  read the next char and set eol
310         /* note: unfortunately isEndOfLine may consumes a character silently.
311         *       this has no effect outside of the method. so a simple workaround
312         *       is to call 'readAgain' on the stream...
313         *       uh: might using objects instead of base-types (jdk1.5 autoboxing!)
314         */
315         int c = in.read();
316         boolean eol = isEndOfLine(c);
317         c = in.readAgain();
318 
319         //  empty line detection: eol AND (last char was EOL or beginning)
320         while (format.isEmptyLinesIgnored() && eol
321                 && (lastChar == '\n'
322                 || lastChar == '\r'
323                 || lastChar == ExtendedBufferedReader.UNDEFINED)
324                 && !isEndOfFile(lastChar)) {
325             // go on char ahead ...
326             lastChar = c;
327             c = in.read();
328             eol = isEndOfLine(c);
329             c = in.readAgain();
330             // reached end of file without any content (empty line at the end)
331             if (isEndOfFile(c)) {
332                 tkn.type = EOF;
333                 return tkn;
334             }
335         }
336 
337         // did we reach eof during the last iteration already ? EOF
338         if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) {
339             tkn.type = EOF;
340             return tkn;
341         }
342 
343         //  important: make sure a new char gets consumed in each iteration
344         while (!tkn.isReady && tkn.type != EOF) {
345             // ignore whitespaces at beginning of a token
346             while (format.isLeadingSpacesIgnored() && isWhitespace(c) && !eol) {
347                 wsBuf.append((char) c);
348                 c = in.read();
349                 eol = isEndOfLine(c);
350             }
351             // ok, start of token reached: comment, encapsulated, or token
352             if (c == format.getCommentStart()) {
353                 // ignore everything till end of line and continue (incr linecount)
354                 in.readLine();
355                 tkn = nextToken(tkn.reset());
356             } else if (c == format.getDelimiter()) {
357                 // empty token return TOKEN("")
358                 tkn.type = TOKEN;
359                 tkn.isReady = true;
360             } else if (eol) {
361                 // empty token return EORECORD("")
362                 //noop: tkn.content.append("");
363                 tkn.type = EORECORD;
364                 tkn.isReady = true;
365             } else if (c == format.getEncapsulator()) {
366                 // consume encapsulated token
367                 encapsulatedTokenLexer(tkn, c);
368             } else if (isEndOfFile(c)) {
369                 // end of file return EOF()
370                 //noop: tkn.content.append("");
371                 tkn.type = EOF;
372                 tkn.isReady = true;
373             } else {
374                 // next token must be a simple token
375                 // add removed blanks when not ignoring whitespace chars...
376                 if (!format.isLeadingSpacesIgnored()) {
377                     tkn.content.append(wsBuf);
378                 }
379                 simpleTokenLexer(tkn, c);
380             }
381         }
382         return tkn;
383     }
384 
385     /**
386      * A simple token lexer
387      * <p/>
388      * Simple token are tokens which are not surrounded by encapsulators.
389      * A simple token might contain escaped delimiters (as \, or \;). The
390      * token is finished when one of the following conditions become true:
391      * <ul>
392      *   <li>end of line has been reached (EORECORD)</li>
393      *   <li>end of stream has been reached (EOF)</li>
394      *   <li>an unescaped delimiter has been reached (TOKEN)</li>
395      * </ul>
396      *
397      * @param tkn the current token
398      * @param c   the current character
399      * @return the filled token
400      * @throws IOException on stream access error
401      */
402     private Token simpleTokenLexer(Token tkn, int c) throws IOException {
403         for (; ;) {
404             if (isEndOfLine(c)) {
405                 // end of record
406                 tkn.type = EORECORD;
407                 tkn.isReady = true;
408                 break;
409             } else if (isEndOfFile(c)) {
410                 // end of file
411                 tkn.type = EOF;
412                 tkn.isReady = true;
413                 break;
414             } else if (c == format.getDelimiter()) {
415                 // end of token
416                 tkn.type = TOKEN;
417                 tkn.isReady = true;
418                 break;
419             } else if (c == format.getEscape()) {
420                 tkn.content.append((char) readEscape(c));
421             } else {
422                 tkn.content.append((char) c);
423             }
424 
425             c = in.read();
426         }
427 
428         if (format.isTrailingSpacesIgnored()) {
429             tkn.content.trimTrailingWhitespace();
430         }
431 
432         return tkn;
433     }
434 
435 
436     /**
437      * An encapsulated token lexer
438      * <p/>
439      * Encapsulated tokens are surrounded by the given encapsulating-string.
440      * The encapsulator itself might be included in the token using a
441      * doubling syntax (as "", '') or using escaping (as in \", \').
442      * Whitespaces before and after an encapsulated token are ignored.
443      *
444      * @param tkn the current token
445      * @param c   the current character
446      * @return a valid token object
447      * @throws IOException on invalid state
448      */
449     private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
450         // save current line
451         int startLineNumber = getLineNumber();
452         // ignore the given delimiter
453         // assert c == delimiter;
454         for (; ;) {
455             c = in.read();
456             
457             if (c == format.getEscape()) {
458                 tkn.content.append((char) readEscape(c));
459             } else if (c == format.getEncapsulator()) {
460                 if (in.lookAhead() == format.getEncapsulator()) {
461                     // double or escaped encapsulator -> add single encapsulator to token
462                     c = in.read();
463                     tkn.content.append((char) c);
464                 } else {
465                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
466                     for (; ;) {
467                         c = in.read();
468                         if (c == format.getDelimiter()) {
469                             tkn.type = TOKEN;
470                             tkn.isReady = true;
471                             return tkn;
472                         } else if (isEndOfFile(c)) {
473                             tkn.type = EOF;
474                             tkn.isReady = true;
475                             return tkn;
476                         } else if (isEndOfLine(c)) {
477                             // ok eo token reached
478                             tkn.type = EORECORD;
479                             tkn.isReady = true;
480                             return tkn;
481                         } else if (!isWhitespace(c)) {
482                             // error invalid char between token and next delimiter
483                             throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
484                         }
485                     }
486                 }
487             } else if (isEndOfFile(c)) {
488                 // error condition (end of file before end of token)
489                 throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
490             } else {
491                 // consume character
492                 tkn.content.append((char) c);
493             }
494         }
495     }
496 
497     private int readEscape(int c) throws IOException {
498         // assume c is the escape char (normally a backslash)
499         c = in.read();
500         switch (c) {
501             case 'r':
502                 return '\r';
503             case 'n':
504                 return '\n';
505             case 't':
506                 return '\t';
507             case 'b':
508                 return '\b';
509             case 'f':
510                 return '\f';
511             default:
512                 return c;
513         }
514     }
515 
516     /**
517      * @return true if the given char is a whitespace character
518      */
519     private boolean isWhitespace(int c) {
520         return Character.isWhitespace((char) c) && (c != format.getDelimiter());
521     }
522 
523     /**
524      * Greedy - accepts \n, \r and \r\n
525      * This checker consumes silently the second control-character...
526      *
527      * @return true if the given character is a line-terminator
528      */
529     private boolean isEndOfLine(int c) throws IOException {
530         // check if we have \r\n...
531         if (c == '\r' && in.lookAhead() == '\n') {
532             // note: does not change c outside of this method !!
533             c = in.read();
534         }
535         return (c == '\n' || c == '\r');
536     }
537 
538     /**
539      * @return true if the given character indicates end of file
540      */
541     private boolean isEndOfFile(int c) {
542         return c == ExtendedBufferedReader.END_OF_STREAM;
543     }
544 }