View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Token.Type.TOKEN;
21  
22  import java.io.Closeable;
23  import java.io.File;
24  import java.io.FileReader;
25  import java.io.IOException;
26  import java.io.InputStreamReader;
27  import java.io.Reader;
28  import java.io.StringReader;
29  import java.net.URL;
30  import java.nio.charset.Charset;
31  import java.util.ArrayList;
32  import java.util.Iterator;
33  import java.util.LinkedHashMap;
34  import java.util.List;
35  import java.util.Map;
36  import java.util.NoSuchElementException;
37  
38  /**
39   * Parses CSV files according to the specified format.
40   *
41   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
42   * specification of a {@link CSVFormat}.
43   *
44   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
45   *
46   * <h4>Creating instances</h4>
47   * There are several static factory methods that can be used to create instances for various types of resources:
48   * <p>
49   * <ul>
50   *     <li>{@link #parse(java.io.File, CSVFormat)}</li>
51   *     <li>{@link #parse(String, CSVFormat)}</li>
52   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
53   * </ul>
54   * </p>
55   * <p>
56   * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
57   * 
58   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
59   * </p>
60   * <pre>
61   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
62   *     ...
63   * }
64   * </pre>
65   *
66   * <h4>Parsing record wise</h4>
67   * <p>
68   * To parse a CSV input from a file, you write:
69   * </p>
70   *
71   * <pre>
72   * File csvData = new File(&quot;/path/to/csv&quot;);
73   * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
74   * for (CSVRecord csvRecord : parser) {
75   *     ...
76   * }
77   * </pre>
78   *
79   * <p>
80   * This will read and parse the contents of the file using the
81   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
82   * </p>
83   *
84   * <p>
85   * To parse CSV input in a format like Excel, you write:
86   * </p>
87   *
88   * <pre>
89   * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
90   * for (CSVRecord csvRecord : parser) {
91   *     ...
92   * }
93   * </pre>
94   *
95   * <p>
96   * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
97   * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
98   * </p>
99   *
100  * <h4>Parsing into memory</h4>
101  * <p>
102  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
103  * </p>
104  *
105  * <pre>
106  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
107  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
108  * List&lt;CSVRecord&gt; list = parser.getRecords();
109  * </pre>
110  *
111  * <p>
112  * There are two constraints that have to be kept in mind:
113  * </p>
114  *
115  * <p>
116  * <ol>
117  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121  * </ol>
122  * </p>
123  *
124  * <h4>Notes</h4>
125  * <p>
126  * Internal parser state is completely covered by the format and the reader-state.
127  * </p>
128  *
129  * @version $Id: CSVParser.java 1559908 2014-01-21 02:44:30Z ggregory $
130  *
131  * @see <a href="package-summary.html">package documentation for more details</a>
132  */
133 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134 
135     /**
136      * Creates a parser for the given {@link File}.
137      *
138      * @param file
139      *            a CSV file. Must not be null.
140      * @param format
141      *            the CSVFormat used for CSV parsing. Must not be null.
142      * @return a new parser
143      * @throws IllegalArgumentException
144      *             If the parameters of the format are inconsistent or if either file or format are null.
145      * @throws IOException
146      *             If an I/O error occurs
147      */
148     public static CSVParser parse(final File file, final CSVFormat format) throws IOException {
149         Assertions.notNull(file, "file");
150         Assertions.notNull(format, "format");
151 
152         return new CSVParser(new FileReader(file), format);
153     }
154 
155     /**
156      * Creates a parser for the given {@link String}.
157      *
158      * @param string
159      *            a CSV string. Must not be null.
160      * @param format
161      *            the CSVFormat used for CSV parsing. Must not be null.
162      * @return a new parser
163      * @throws IllegalArgumentException
164      *             If the parameters of the format are inconsistent or if either string or format are null.
165      * @throws IOException
166      *             If an I/O error occurs
167      */
168     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
169         Assertions.notNull(string, "string");
170         Assertions.notNull(format, "format");
171 
172         return new CSVParser(new StringReader(string), format);
173     }
174 
175     /**
176      * Creates a parser for the given URL.
177      *
178      * <p>
179      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
180      * you close the {@code url}.
181      * </p>
182      *
183      * @param url
184      *            a URL. Must not be null.
185      * @param charset
186      *            the charset for the resource. Must not be null.
187      * @param format
188      *            the CSVFormat used for CSV parsing. Must not be null.
189      * @return a new parser
190      * @throws IllegalArgumentException
191      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
192      * @throws IOException
193      *             If an I/O error occurs
194      */
195     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
196         Assertions.notNull(url, "url");
197         Assertions.notNull(charset, "charset");
198         Assertions.notNull(format, "format");
199 
200         return new CSVParser(new InputStreamReader(url.openStream(),
201                              charset == null ? Charset.forName("UTF-8") : charset), format);
202     }
203 
204     // the following objects are shared to reduce garbage
205 
206     private final CSVFormat format;
207 
208     /** A mapping of column names to column indices */
209     private final Map<String, Integer> headerMap;
210 
211     private final Lexer lexer;
212 
213     /** A record buffer for getRecord(). Grows as necessary and is reused. */
214     private final List<String> record = new ArrayList<String>();
215 
216     private long recordNumber;
217 
218     private final Token reusableToken = new Token();
219 
220     /**
221      * Customized CSV parser using the given {@link CSVFormat}
222      *
223      * <p>
224      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
225      * unless you close the {@code reader}.
226      * </p>
227      *
228      * @param reader
229      *            a Reader containing CSV-formatted input. Must not be null.
230      * @param format
231      *            the CSVFormat used for CSV parsing. Must not be null.
232      * @throws IllegalArgumentException
233      *             If the parameters of the format are inconsistent or if either reader or format are null.
234      * @throws IOException
235      *             If an I/O error occurs
236      */
237     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
238         Assertions.notNull(reader, "reader");
239         Assertions.notNull(format, "format");
240 
241         format.validate();
242         this.format = format;
243         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
244         this.headerMap = this.initializeHeader();
245     }
246 
247     private void addRecordValue() {
248         final String input = this.reusableToken.content.toString();
249         final String nullString = this.format.getNullString();
250         if (nullString == null) {
251             this.record.add(input);
252         } else {
253             this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
254         }
255     }
256 
257     /**
258      * Closes resources.
259      *
260      * @throws IOException
261      *             If an I/O error occurs
262      */
263     public void close() throws IOException {
264         if (this.lexer != null) {
265             this.lexer.close();
266         }
267     }
268 
269     /**
270      * Returns the current line number in the input stream.
271      * <p/>
272      * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number.
273      *
274      * @return current line number
275      */
276     public long getCurrentLineNumber() {
277         return this.lexer.getCurrentLineNumber();
278     }
279 
280     /**
281      * Returns a copy of the header map that iterates in column order.
282      * <p>
283      * The map keys are column names. The map values are 0-based indices.
284      * </p>
285      * @return a copy of the header map that iterates in column order.
286      */
287     public Map<String, Integer> getHeaderMap() {
288         return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
289     }
290 
291     /**
292      * Returns the current record number in the input stream.
293      * <p/>
294      * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number.
295      *
296      * @return current line number
297      */
298     public long getRecordNumber() {
299         return this.recordNumber;
300     }
301 
302     /**
303      * Parses the CSV input according to the given format and returns the content as a list of
304      * {@link CSVRecord CSVRecords}.
305      * <p/>
306      * The returned content starts at the current parse-position in the stream.
307      *
308      * @return list of {@link CSVRecord CSVRecords}, may be empty
309      * @throws IOException
310      *             on parse error or input read-failure
311      */
312     public List<CSVRecord> getRecords() throws IOException {
313         final List<CSVRecord> records = new ArrayList<CSVRecord>();
314         CSVRecord rec;
315         while ((rec = this.nextRecord()) != null) {
316             records.add(rec);
317         }
318         return records;
319     }
320 
321     /**
322      * Initializes the name to index mapping if the format defines a header.
323      * 
324      * @return null if the format has no header.
325      */
326     private Map<String, Integer> initializeHeader() throws IOException {
327         Map<String, Integer> hdrMap = null;
328         final String[] formatHeader = this.format.getHeader();
329         if (formatHeader != null) {
330             hdrMap = new LinkedHashMap<String, Integer>();
331 
332             String[] header = null;
333             if (formatHeader.length == 0) {
334                 // read the header from the first line of the file
335                 final CSVRecord nextRecord = this.nextRecord();
336                 if (nextRecord != null) {
337                     header = nextRecord.values();
338                 }
339             } else {
340                 if (this.format.getSkipHeaderRecord()) {
341                     this.nextRecord();
342                 }
343                 header = formatHeader;
344             }
345 
346             // build the name to index mappings
347             if (header != null) {
348                 for (int i = 0; i < header.length; i++) {
349                     hdrMap.put(header[i], Integer.valueOf(i));
350                 }
351             }
352         }
353         return hdrMap;
354     }
355 
356     public boolean isClosed() {
357         return this.lexer.isClosed();
358     }
359 
360     /**
361      * Returns an iterator on the records.
362      *
363      * <p>IOExceptions occurring during the iteration are wrapped in a
364      * RuntimeException.
365      * If the parser is closed a call to {@code next()} will throw a
366      * NoSuchElementException.</p>
367      */
368     public Iterator<CSVRecord> iterator() {
369         return new Iterator<CSVRecord>() {
370             private CSVRecord current;
371 
372             private CSVRecord getNextRecord() {
373                 try {
374                     return CSVParser.this.nextRecord();
375                 } catch (final IOException e) {
376                     // TODO: This is not great, throw an ISE instead?
377                     throw new RuntimeException(e);
378                 }
379             }
380 
381             public boolean hasNext() {
382                 if (CSVParser.this.isClosed()) {
383                     return false;
384                 }
385                 if (this.current == null) {
386                     this.current = this.getNextRecord();
387                 }
388 
389                 return this.current != null;
390             }
391 
392             public CSVRecord next() {
393                 if (CSVParser.this.isClosed()) {
394                     throw new NoSuchElementException("CSVParser has been closed");
395                 }
396                 CSVRecord next = this.current;
397                 this.current = null;
398 
399                 if (next == null) {
400                     // hasNext() wasn't called before
401                     next = this.getNextRecord();
402                     if (next == null) {
403                         throw new NoSuchElementException("No more CSV records available");
404                     }
405                 }
406 
407                 return next;
408             }
409 
410             public void remove() {
411                 throw new UnsupportedOperationException();
412             }
413         };
414     }
415 
416     /**
417      * Parses the next record from the current point in the stream.
418      *
419      * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
420      * @throws IOException
421      *             on parse error or input read-failure
422      */
423     CSVRecord nextRecord() throws IOException {
424         CSVRecord result = null;
425         this.record.clear();
426         StringBuilder sb = null;
427         do {
428             this.reusableToken.reset();
429             this.lexer.nextToken(this.reusableToken);
430             switch (this.reusableToken.type) {
431             case TOKEN:
432                 this.addRecordValue();
433                 break;
434             case EORECORD:
435                 this.addRecordValue();
436                 break;
437             case EOF:
438                 if (this.reusableToken.isReady) {
439                     this.addRecordValue();
440                 }
441                 break;
442             case INVALID:
443                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
444             case COMMENT: // Ignored currently
445                 if (sb == null) { // first comment for this record
446                     sb = new StringBuilder();
447                 } else {
448                     sb.append(Constants.LF);
449                 }
450                 sb.append(this.reusableToken.content);
451                 this.reusableToken.type = TOKEN; // Read another token
452                 break;
453             }
454         } while (this.reusableToken.type == TOKEN);
455 
456         if (!this.record.isEmpty()) {
457             this.recordNumber++;
458             final String comment = sb == null ? null : sb.toString();
459             result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
460                     this.recordNumber);
461         }
462         return result;
463     }
464 
465 }