View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.csv;
18  
19  import java.io.IOException;
20  import java.io.Reader;
21  import java.io.InputStreamReader;
22  import java.io.InputStream;
23  import java.util.ArrayList;
24  
25  
26  /**
27   * Parses CSV files according to the specified configuration.
28   *
29   * Because CSV appears in many different dialects, the parser supports many
30   * configuration settings by allowing the specification of a {@link CSVStrategy}.
31   * 
32   * <p>Parsing of a csv-string having tabs as separators,
33   * '"' as an optional value encapsulator, and comments starting with '#':</p>
34   * <pre>
35   *  String[][] data = 
36   *   (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
37   * </pre>
38   * 
39   * <p>Parsing of a csv-string in Excel CSV format</p>
40   * <pre>
41   *  String[][] data =
42   *   (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
43   * </pre>
44   * 
45   * <p>
46   * Internal parser state is completely covered by the strategy
47   * and the reader-state.</p>
48   * 
49   * <p>see <a href="package-summary.html">package documentation</a> 
50   * for more details</p>
51   */
52  public class CSVParser {
53  
54    /** length of the initial token (content-)buffer */
55    private static final int INITIAL_TOKEN_LENGTH = 50;
56    
57    // the token types
58    /** Token has no valid content, i.e. is in its initilized state. */
59    protected static final int TT_INVALID = -1;
60    /** Token with content, at beginning or in the middle of a line. */
61    protected static final int TT_TOKEN = 0;
62    /** Token (which can have content) when end of file is reached. */
63    protected static final int TT_EOF = 1;
64    /** Token with content when end of a line is reached. */
65    protected static final int TT_EORECORD = 2;
66  
67    /** Immutable empty String array. */
68    private static final String[] EMPTY_STRING_ARRAY = new String[0];
69     
70    // the input stream
71    private final ExtendedBufferedReader in;
72  
73    // TODO: this can be made final if setStrategy is removed
74    private CSVStrategy strategy;
75    
76    // the following objects are shared to reduce garbage 
77    /** A record buffer for getLine(). Grows as necessary and is reused. */
78    private final ArrayList record = new ArrayList();
79    private final Token reusableToken = new Token();
80    private final CharBuffer wsBuf = new CharBuffer();
81    private final CharBuffer code = new CharBuffer(4);
82  
83    
84    /**
85     * Token is an internal token representation.
86     * 
87     * It is used as contract between the lexer and the parser. 
88     */
89    static class Token {
90      /** Token type, see TT_xxx constants. */
91      int type = TT_INVALID;
92      /** The content buffer. */
93      CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
94      /** Token ready flag: indicates a valid token with content (ready for the parser). */
95      boolean isReady;
96      
97      Token reset() {
98          content.clear();
99          type = TT_INVALID;
100         isReady = false;
101         return this;
102     }
103   }
104   
105   // ======================================================
106   //  the constructor
107   // ======================================================
108   
/**
 * Creates a parser over a byte stream.
 *
 * The stream is wrapped in an {@link InputStreamReader} and handed to
 * {@link #CSVParser(Reader)}, so the default {@link CSVStrategy} applies.
 *
 * @param input an InputStream containing "csv-formatted" stream
 * @deprecated use {@link #CSVParser(Reader)}.
 */
public CSVParser(InputStream input) {
  this(new InputStreamReader(input));
}
118   
/**
 * Creates a CSV parser that uses the default {@link CSVStrategy}.
 *
 * @param input a Reader containing "csv-formatted" input
 */
public CSVParser(Reader input) {
  // Keep in sync with the default CSV strategy: the delimiter is a comma.
  this(input, ',');
}
128   
/**
 * Creates a parser with a custom value delimiter.
 *
 * Apart from the delimiter, the parser follows the default
 * {@link CSVStrategy}: values may be encapsulated in '"' and comments
 * are disabled.
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter) {
  this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
}
142   
/**
 * Creates a customized csv parser.
 *
 * The three characters are used to build a {@link CSVStrategy}, and the
 * parser parses according to that CSV dialect. Leading whitespaces are
 * truncated, unicode escapes are not interpreted and empty lines are
 * ignored.
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @param encapsulator a Char used as value encapsulation marker
 * @param commentStart a Char used for comment identification
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
  this(input, new CSVStrategy(delimiter, encapsulator, commentStart));
}
159 
/**
 * Creates a customized CSV parser driven by the given {@link CSVStrategy}.
 *
 * The reader is wrapped in an ExtendedBufferedReader; the strategy is
 * stored as given.
 *
 * @param input a Reader containing "csv-formatted" input
 * @param strategy the CSVStrategy used for CSV parsing
 */
public CSVParser(Reader input, CSVStrategy strategy) {
  this.strategy = strategy;
  this.in = new ExtendedBufferedReader(input);
}
170   
171   // ======================================================
172   //  the parser
173   // ======================================================
174   
175   /**
176    * Parses the CSV according to the given strategy
177    * and returns the content as an array of records
178    * (whereas records are arrays of single values).
179    * <p>
180    * The returned content starts at the current parse-position in
181    * the stream.
182    * 
183    * @return matrix of records x values ('null' when end of file)
184    * @throws IOException on parse error or input read-failure
185    */
186   public String[][] getAllValues() throws IOException {
187     ArrayList records = new ArrayList();
188     String[] values;
189     String[][] ret = null;
190     while ((values = getLine()) != null)  {
191       records.add(values);
192     }
193     if (records.size() > 0) {
194       ret = new String[records.size()][];
195       records.toArray(ret);
196     }
197     return ret;
198   }
199   
200   /**
201    * Parses the CSV according to the given strategy
202    * and returns the next csv-value as string.
203    * 
204    * @return next value in the input stream ('null' when end of file)
205    * @throws IOException on parse error or input read-failure
206    */
207   public String nextValue() throws IOException {
208     Token tkn = nextToken();
209     String ret = null;
210     switch (tkn.type) {
211       case TT_TOKEN:
212       case TT_EORECORD: 
213         ret = tkn.content.toString();
214         break;
215       case TT_EOF:
216         ret = null;
217         break;
218       case TT_INVALID:
219       default:
220         // error no token available (or error)
221         throw new IOException(
222           "(line " + getLineNumber() 
223           + ") invalid parse sequence");
224         // unreachable: break;
225     }
226     return ret;
227   }
228   
229   /**
230    * Parses from the current point in the stream til
231    * the end of the current line.
232    * 
233    * @return array of values til end of line 
234    *        ('null' when end of file has been reached)
235    * @throws IOException on parse error or input read-failure
236    */
237   public String[] getLine() throws IOException {
238     String[] ret = EMPTY_STRING_ARRAY;
239     record.clear();
240     while (true) {
241         reusableToken.reset();
242         nextToken(reusableToken);
243         switch (reusableToken.type) {
244             case TT_TOKEN:
245                 record.add(reusableToken.content.toString());
246                 break;
247             case TT_EORECORD:
248                 record.add(reusableToken.content.toString());
249                 break;
250             case TT_EOF:
251                 if (reusableToken.isReady) {
252                     record.add(reusableToken.content.toString());
253                 } else {
254                     ret = null;
255                 }
256                 break;
257             case TT_INVALID:
258             default:
259                 // error: throw IOException
260                 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
261             // unreachable: break;
262         }
263         if (reusableToken.type != TT_TOKEN) {
264             break;
265         }
266     }
267     if (!record.isEmpty()) {
268       ret = (String[]) record.toArray(new String[record.size()]);
269     }
270     return ret;
271   }
272   
273   /**
274    * Returns the current line number in the input stream.
275    * 
276    * ATTENTION: in case your csv has multiline-values the returned
277    *            number does not correspond to the record-number
278    * 
279    * @return  current line number
280    */
281   public int getLineNumber() {
282     return in.getLineNumber();  
283   }
284   
285   // ======================================================
286   //  the lexer(s)
287   // ======================================================
288  
289   /**
290    * Convenience method for <code>nextToken(null)</code>.
291    */
292   protected Token nextToken() throws IOException {
293       return nextToken(new Token());
294   }
295   
296  /**
297    * Returns the next token.
298    * 
299    * A token corresponds to a term, a record change or an
300    * end-of-file indicator.
301    * 
302    * @param tkn an existing Token object to reuse. The caller is responsible to initialize the
303    * Token.
304    * @return the next token found
305    * @throws IOException on stream access error
306    */
307   protected Token nextToken(Token tkn) throws IOException {
308     wsBuf.clear(); // resuse
309     
310     // get the last read char (required for empty line detection)
311     int lastChar = in.readAgain();
312     
313     //  read the next char and set eol
314     /* note: unfourtunately isEndOfLine may consumes a character silently.
315      *       this has no effect outside of the method. so a simple workaround
316      *       is to call 'readAgain' on the stream...
317      *       uh: might using objects instead of base-types (jdk1.5 autoboxing!)
318      */
319     int c = in.read();
320     boolean eol = isEndOfLine(c);
321     c = in.readAgain();
322      
323     //  empty line detection: eol AND (last char was EOL or beginning)
324     while (strategy.getIgnoreEmptyLines() && eol 
325       && (lastChar == '\n' 
326       || lastChar == ExtendedBufferedReader.UNDEFINED) 
327       && !isEndOfFile(lastChar)) {
328       // go on char ahead ...
329       lastChar = c;
330       c = in.read();
331       eol = isEndOfLine(c);
332       c = in.readAgain();
333       // reached end of file without any content (empty line at the end)
334       if (isEndOfFile(c)) {
335         tkn.type = TT_EOF;
336         return tkn;
337       }
338     }
339 
340     // did we reached eof during the last iteration already ? TT_EOF
341     if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
342       tkn.type = TT_EOF;
343       return tkn;
344     } 
345     
346     //  important: make sure a new char gets consumed in each iteration
347     while (!tkn.isReady) {
348       // ignore whitespaces at beginning of a token
349       while (isWhitespace(c) && !eol) {
350         wsBuf.append((char) c);
351         c = in.read();
352         eol = isEndOfLine(c);
353       }
354       // ok, start of token reached: comment, encapsulated, or token
355       if (c == strategy.getCommentStart()) {
356         // ignore everything till end of line and continue (incr linecount)
357         in.readLine();
358         tkn = nextToken(tkn.reset());
359       } else if (c == strategy.getDelimiter()) {
360         // empty token return TT_TOKEN("")
361         tkn.type = TT_TOKEN;
362         tkn.isReady = true;
363       } else if (eol) {
364         // empty token return TT_EORECORD("")
365         //noop: tkn.content.append("");
366         tkn.type = TT_EORECORD;
367         tkn.isReady = true;
368       } else if (c == strategy.getEncapsulator()) {
369         // consume encapsulated token
370         encapsulatedTokenLexer(tkn, c);
371       } else if (isEndOfFile(c)) {
372         // end of file return TT_EOF()
373         //noop: tkn.content.append("");
374         tkn.type = TT_EOF;
375         tkn.isReady = true;
376       } else {
377         // next token must be a simple token
378         // add removed blanks when not ignoring whitespace chars...
379         if (!strategy.getIgnoreLeadingWhitespaces()) {
380           tkn.content.append(wsBuf);
381         }
382         simpleTokenLexer(tkn, c);
383       }
384     }
385     return tkn;  
386   }
387   
388   /**
389    * A simple token lexer
390    * 
391    * Simple token are tokens which are not surrounded by encapsulators.
392    * A simple token might contain escaped delimiters (as \, or \;). The
393    * token is finished when one of the following conditions become true:
394    * <ul>
395    *   <li>end of line has been reached (TT_EORECORD)</li>
396    *   <li>end of stream has been reached (TT_EOF)</li>
397    *   <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
398    * </ul>
399    *  
400    * @param tkn  the current token
401    * @param c    the current character
402    * @return the filled token
403    * 
404    * @throws IOException on stream access error
405    */
406   private Token simpleTokenLexer(Token tkn, int c) throws IOException {
407     for (;;) {
408       if (isEndOfLine(c)) {
409         // end of record
410         tkn.type = TT_EORECORD;
411         tkn.isReady = true;
412         break;
413       } else if (isEndOfFile(c)) {
414         // end of file
415         tkn.type = TT_EOF;
416         tkn.isReady = true;
417         break;
418       } else if (c == strategy.getDelimiter()) {
419         // end of token
420         tkn.type = TT_TOKEN;
421         tkn.isReady = true;
422         break;
423       } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
424         // interpret unicode escaped chars (like \u0070 -> p)
425         tkn.content.append((char) unicodeEscapeLexer(c));
426       } else if (c == strategy.getEscape()) {
427         tkn.content.append((char)readEscape(c));
428       } else {
429         tkn.content.append((char) c);
430       }
431       
432       c = in.read();
433     }
434 
435     if (strategy.getIgnoreTrailingWhitespaces()) {
436       tkn.content.trimTrailingWhitespace();
437     }
438 
439     return tkn;
440   }
441   
442   
443   /**
444    * An encapsulated token lexer
445    * 
446    * Encapsulated tokens are surrounded by the given encapsulating-string.
447    * The encapsulator itself might be included in the token using a
448    * doubling syntax (as "", '') or using escaping (as in \", \').
449    * Whitespaces before and after an encapsulated token are ignored.
450    * 
451    * @param tkn    the current token
452    * @param c      the current character
453    * @return a valid token object
454    * @throws IOException on invalid state
455    */
456   private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
457     // save current line
458     int startLineNumber = getLineNumber();
459     // ignore the given delimiter
460     // assert c == delimiter;
461     for (;;) {
462       c = in.read();
463 
464       if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
465         tkn.content.append((char) unicodeEscapeLexer(c));
466       } else if (c == strategy.getEscape()) {
467         tkn.content.append((char)readEscape(c));
468       } else if (c == strategy.getEncapsulator()) {
469         if (in.lookAhead() == strategy.getEncapsulator()) {
470           // double or escaped encapsulator -> add single encapsulator to token
471           c = in.read();
472           tkn.content.append((char) c);
473         } else {
474           // token finish mark (encapsulator) reached: ignore whitespace till delimiter
475           for (;;) {
476             c = in.read();
477             if (c == strategy.getDelimiter()) {
478               tkn.type = TT_TOKEN;
479               tkn.isReady = true;
480               return tkn;
481             } else if (isEndOfFile(c)) {
482               tkn.type = TT_EOF;
483               tkn.isReady = true;
484               return tkn;
485             } else if (isEndOfLine(c)) {
486               // ok eo token reached
487               tkn.type = TT_EORECORD;
488               tkn.isReady = true;
489               return tkn;
490             } else if (!isWhitespace(c)) {
491               // error invalid char between token and next delimiter
492               throw new IOException(
493                       "(line " + getLineNumber()
494                               + ") invalid char between encapsulated token end delimiter"
495               );
496             }
497           }
498         }
499       } else if (isEndOfFile(c)) {
500         // error condition (end of file before end of token)
501         throw new IOException(
502                 "(startline " + startLineNumber + ")"
503                         + "eof reached before encapsulated token finished"
504         );
505       } else {
506         // consume character
507         tkn.content.append((char) c);
508       }
509     }
510   }
511   
512   
513   /**
514    * Decodes Unicode escapes.
515    * 
516    * Interpretation of "\\uXXXX" escape sequences
517    * where XXXX is a hex-number.
518    * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
519    * @return the decoded character
520    * @throws IOException on wrong unicode escape sequence or read error
521    */
522   protected int unicodeEscapeLexer(int c) throws IOException {
523     int ret = 0;
524     // ignore 'u' (assume c==\ now) and read 4 hex digits
525     c = in.read();
526     code.clear();
527     try {
528       for (int i = 0; i < 4; i++) {
529         c  = in.read();
530         if (isEndOfFile(c) || isEndOfLine(c)) {
531           throw new NumberFormatException("number too short");
532         }
533         code.append((char) c);
534       }
535       ret = Integer.parseInt(code.toString(), 16);
536     } catch (NumberFormatException e) {
537       throw new IOException(
538         "(line " + getLineNumber() + ") Wrong unicode escape sequence found '" 
539         + code.toString() + "'" + e.toString());
540     }
541     return ret;
542   }
543 
544   private int readEscape(int c) throws IOException {
545     // assume c is the escape char (normally a backslash)
546     c = in.read();
547     int out;
548     switch (c) {
549       case 'r': out='\r'; break;
550       case 'n': out='\n'; break;
551       case 't': out='\t'; break;
552       case 'b': out='\b'; break;
553       case 'f': out='\f'; break;
554       default : out=c;
555     }
556     return out;
557   }
558   
559   // ======================================================
560   //  strategies
561   // ======================================================
562   
563   /**
564    * Sets the specified CSV Strategy
565    *
566    * @return current instance of CSVParser to allow chained method calls
567    * @deprecated the strategy should be set in the constructor {@link #CSVParser(Reader,CSVStrategy)}.
568    */
569   public CSVParser setStrategy(CSVStrategy strategy) {
570     this.strategy = strategy;
571     return this;
572   }
573   
574   /**
575    * Obtain the specified CSV Strategy
576    * 
577    * @return strategy currently being used
578    */
579   public CSVStrategy getStrategy() {
580     return this.strategy;
581   }
582   
583   // ======================================================
584   //  Character class checker
585   // ======================================================
586   
587   /**
588    * @return true if the given char is a whitespace character
589    */
590   private boolean isWhitespace(int c) {
591     return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
592   }
593   
594   /**
595    * Greedy - accepts \n and \r\n 
596    * This checker consumes silently the second control-character...
597    * 
598    * @return true if the given character is a line-terminator
599    */
600   private boolean isEndOfLine(int c) throws IOException {
601     // check if we have \r\n...
602     if (c == '\r') {
603       if (in.lookAhead() == '\n') {
604         // note: does not change c outside of this method !!
605         c = in.read();
606       }
607     }
608     return (c == '\n');
609   }
610   
611   /**
612    * @return true if the given character indicates end of file
613    */
614   private boolean isEndOfFile(int c) {
615     return c == ExtendedBufferedReader.END_OF_STREAM;
616   }
617 }