View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Token.Type.TOKEN;
21  
22  import java.io.Closeable;
23  import java.io.File;
24  import java.io.FileReader;
25  import java.io.IOException;
26  import java.io.InputStreamReader;
27  import java.io.Reader;
28  import java.io.StringReader;
29  import java.net.URL;
30  import java.nio.charset.Charset;
31  import java.util.ArrayList;
32  import java.util.Arrays;
33  import java.util.Collection;
34  import java.util.Iterator;
35  import java.util.LinkedHashMap;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.NoSuchElementException;
39  
40  /**
41   * Parses CSV files according to the specified format.
42   *
43   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
44   * specification of a {@link CSVFormat}.
45   *
46   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
47   *
48   * <h2>Creating instances</h2>
49   * <p>
50   * There are several static factory methods that can be used to create instances for various types of resources:
51   * </p>
52   * <ul>
53   *     <li>{@link #parse(java.io.File, CSVFormat)}</li>
54   *     <li>{@link #parse(String, CSVFormat)}</li>
55   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
56   * </ul>
57   * <p>
58   * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
59   *
60   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
61   * </p>
62   * <pre>
63   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
64   *     ...
65   * }
66   * </pre>
67   *
68   * <h2>Parsing record wise</h2>
69   * <p>
70   * To parse a CSV input from a file, you write:
71   * </p>
72   *
73   * <pre>
74   * File csvData = new File(&quot;/path/to/csv&quot;);
75   * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
76   * for (CSVRecord csvRecord : parser) {
77   *     ...
78   * }
79   * </pre>
80   *
81   * <p>
82   * This will read and parse the contents of the file using the
83   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
84   * </p>
85   *
86   * <p>
87   * To parse CSV input in a format like Excel, you write:
88   * </p>
89   *
90   * <pre>
91   * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
92   * for (CSVRecord csvRecord : parser) {
93   *     ...
94   * }
95   * </pre>
96   *
97   * <p>
98   * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
99   * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
100  * </p>
101  *
102  * <h2>Parsing into memory</h2>
103  * <p>
104  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
105  * </p>
106  *
107  * <pre>
108  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
109  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
110  * List&lt;CSVRecord&gt; list = parser.getRecords();
111  * </pre>
112  *
113  * <p>
114  * There are two constraints that have to be kept in mind:
115  * </p>
116  *
117  * <ol>
118  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
119  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
120  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
121  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
122  * </ol>
123  *
124  * <h2>Notes</h2>
125  * <p>
126  * Internal parser state is completely covered by the format and the reader-state.
127  * </p>
128  *
129  * @version $Id: CSVParser.java 1592382 2014-05-04 17:01:07Z britter $
130  *
131  * @see <a href="package-summary.html">package documentation for more details</a>
132  */
133 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134 
135     /**
136      * Creates a parser for the given {@link File}.
137      *
138      * @param file
139      *            a CSV file. Must not be null.
140      * @param format
141      *            the CSVFormat used for CSV parsing. Must not be null.
142      * @return a new parser
143      * @throws IllegalArgumentException
144      *             If the parameters of the format are inconsistent or if either file or format are null.
145      * @throws IOException
146      *             If an I/O error occurs
147      */
148     public static CSVParser parse(final File file, final CSVFormat format) throws IOException {
149         Assertions.notNull(file, "file");
150         Assertions.notNull(format, "format");
151 
152         return new CSVParser(new FileReader(file), format);
153     }
154 
155     /**
156      * Creates a parser for the given {@link String}.
157      *
158      * @param string
159      *            a CSV string. Must not be null.
160      * @param format
161      *            the CSVFormat used for CSV parsing. Must not be null.
162      * @return a new parser
163      * @throws IllegalArgumentException
164      *             If the parameters of the format are inconsistent or if either string or format are null.
165      * @throws IOException
166      *             If an I/O error occurs
167      */
168     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
169         Assertions.notNull(string, "string");
170         Assertions.notNull(format, "format");
171 
172         return new CSVParser(new StringReader(string), format);
173     }
174 
175     /**
176      * Creates a parser for the given URL.
177      *
178      * <p>
179      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
180      * you close the {@code url}.
181      * </p>
182      *
183      * @param url
184      *            a URL. Must not be null.
185      * @param charset
186      *            the charset for the resource. Must not be null.
187      * @param format
188      *            the CSVFormat used for CSV parsing. Must not be null.
189      * @return a new parser
190      * @throws IllegalArgumentException
191      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
192      * @throws IOException
193      *             If an I/O error occurs
194      */
195     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
196         Assertions.notNull(url, "url");
197         Assertions.notNull(charset, "charset");
198         Assertions.notNull(format, "format");
199 
200         return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
201     }
202 
203     // the following objects are shared to reduce garbage
204 
205     private final CSVFormat format;
206 
207     /** A mapping of column names to column indices */
208     private final Map<String, Integer> headerMap;
209 
210     private final Lexer lexer;
211 
212     /** A record buffer for getRecord(). Grows as necessary and is reused. */
213     private final List<String> record = new ArrayList<String>();
214 
215     private long recordNumber;
216 
217     private final Token reusableToken = new Token();
218 
219     /**
220      * Customized CSV parser using the given {@link CSVFormat}
221      *
222      * <p>
223      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
224      * unless you close the {@code reader}.
225      * </p>
226      *
227      * @param reader
228      *            a Reader containing CSV-formatted input. Must not be null.
229      * @param format
230      *            the CSVFormat used for CSV parsing. Must not be null.
231      * @throws IllegalArgumentException
232      *             If the parameters of the format are inconsistent or if either reader or format are null.
233      * @throws IOException
234      *             If an I/O error occurs
235      */
236     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
237         Assertions.notNull(reader, "reader");
238         Assertions.notNull(format, "format");
239 
240         format.validate();
241         this.format = format;
242         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
243         this.headerMap = this.initializeHeader();
244     }
245 
246     private void addRecordValue() {
247         final String input = this.reusableToken.content.toString();
248         final String nullString = this.format.getNullString();
249         if (nullString == null) {
250             this.record.add(input);
251         } else {
252             this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
253         }
254     }
255 
256     /**
257      * Closes resources.
258      *
259      * @throws IOException
260      *             If an I/O error occurs
261      */
262     public void close() throws IOException {
263         if (this.lexer != null) {
264             this.lexer.close();
265         }
266     }
267 
268     /**
269      * Returns the current line number in the input stream.
270      *
271      * <p>
272      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
273      * the record number.
274      * </p>
275      *
276      * @return current line number
277      */
278     public long getCurrentLineNumber() {
279         return this.lexer.getCurrentLineNumber();
280     }
281 
282     /**
283      * Returns a copy of the header map that iterates in column order.
284      * <p>
285      * The map keys are column names. The map values are 0-based indices.
286      * </p>
287      * @return a copy of the header map that iterates in column order.
288      */
289     public Map<String, Integer> getHeaderMap() {
290         return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
291     }
292 
293     /**
294      * Returns the current record number in the input stream.
295      *
296      * <p>
297      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
298      * the line number.
299      * </p>
300      *
301      * @return current line number
302      */
303     public long getRecordNumber() {
304         return this.recordNumber;
305     }
306 
307     /**
308      * Parses the CSV input according to the given format and returns the content as a list of
309      * {@link CSVRecord CSVRecords}.
310      *
311      * <p>
312      * The returned content starts at the current parse-position in the stream.
313      * </p>
314      *
315      * @return list of {@link CSVRecord CSVRecords}, may be empty
316      * @throws IOException
317      *             on parse error or input read-failure
318      */
319     public List<CSVRecord> getRecords() throws IOException {
320         return getRecords(new ArrayList<CSVRecord>());
321     }
322 
323     /**
324      * Parses the CSV input according to the given format and adds the content to the collection of {@link CSVRecord
325      * CSVRecords}.
326      *
327      * <p>
328      * The returned content starts at the current parse-position in the stream.
329      * </p>
330      *
331      * @param records
332      *            The collection to add to.
333      * @param <T> the type of collection used.
334      * @return a collection of {@link CSVRecord CSVRecords}, may be empty
335      * @throws IOException
336      *             on parse error or input read-failure
337      */
338     public <T extends Collection<CSVRecord>> T getRecords(T records) throws IOException {
339         CSVRecord rec;
340         while ((rec = this.nextRecord()) != null) {
341             records.add(rec);
342         }
343         return records;
344     }
345 
346     /**
347      * Initializes the name to index mapping if the format defines a header.
348      *
349      * @return null if the format has no header.
350      */
351     private Map<String, Integer> initializeHeader() throws IOException {
352         Map<String, Integer> hdrMap = null;
353         final String[] formatHeader = this.format.getHeader();
354         if (formatHeader != null) {
355             hdrMap = new LinkedHashMap<String, Integer>();
356 
357             String[] header = null;
358             if (formatHeader.length == 0) {
359                 // read the header from the first line of the file
360                 final CSVRecord nextRecord = this.nextRecord();
361                 if (nextRecord != null) {
362                     header = nextRecord.values();
363                 }
364             } else {
365                 if (this.format.getSkipHeaderRecord()) {
366                     this.nextRecord();
367                 }
368                 header = formatHeader;
369             }
370 
371             // build the name to index mappings
372             if (header != null) {
373                 for (int i = 0; i < header.length; i++) {
374                     if (hdrMap.containsKey(header[i])) {
375                         throw new IllegalStateException("The header contains duplicate names: "
376                                 + Arrays.toString(header));
377                     }
378                     hdrMap.put(header[i], Integer.valueOf(i));
379                 }
380             }
381         }
382         return hdrMap;
383     }
384 
385     public boolean isClosed() {
386         return this.lexer.isClosed();
387     }
388 
389     /**
390      * Returns an iterator on the records.
391      *
392      * <p>IOExceptions occurring during the iteration are wrapped in a
393      * RuntimeException.
394      * If the parser is closed a call to {@code next()} will throw a
395      * NoSuchElementException.</p>
396      */
397     public Iterator<CSVRecord> iterator() {
398         return new Iterator<CSVRecord>() {
399             private CSVRecord current;
400 
401             private CSVRecord getNextRecord() {
402                 try {
403                     return CSVParser.this.nextRecord();
404                 } catch (final IOException e) {
405                     // TODO: This is not great, throw an ISE instead?
406                     throw new RuntimeException(e);
407                 }
408             }
409 
410             public boolean hasNext() {
411                 if (CSVParser.this.isClosed()) {
412                     return false;
413                 }
414                 if (this.current == null) {
415                     this.current = this.getNextRecord();
416                 }
417 
418                 return this.current != null;
419             }
420 
421             public CSVRecord next() {
422                 if (CSVParser.this.isClosed()) {
423                     throw new NoSuchElementException("CSVParser has been closed");
424                 }
425                 CSVRecord next = this.current;
426                 this.current = null;
427 
428                 if (next == null) {
429                     // hasNext() wasn't called before
430                     next = this.getNextRecord();
431                     if (next == null) {
432                         throw new NoSuchElementException("No more CSV records available");
433                     }
434                 }
435 
436                 return next;
437             }
438 
439             public void remove() {
440                 throw new UnsupportedOperationException();
441             }
442         };
443     }
444 
445     /**
446      * Parses the next record from the current point in the stream.
447      *
448      * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
449      * @throws IOException
450      *             on parse error or input read-failure
451      */
452     CSVRecord nextRecord() throws IOException {
453         CSVRecord result = null;
454         this.record.clear();
455         StringBuilder sb = null;
456         do {
457             this.reusableToken.reset();
458             this.lexer.nextToken(this.reusableToken);
459             switch (this.reusableToken.type) {
460             case TOKEN:
461                 this.addRecordValue();
462                 break;
463             case EORECORD:
464                 this.addRecordValue();
465                 break;
466             case EOF:
467                 if (this.reusableToken.isReady) {
468                     this.addRecordValue();
469                 }
470                 break;
471             case INVALID:
472                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
473             case COMMENT: // Ignored currently
474                 if (sb == null) { // first comment for this record
475                     sb = new StringBuilder();
476                 } else {
477                     sb.append(Constants.LF);
478                 }
479                 sb.append(this.reusableToken.content);
480                 this.reusableToken.type = TOKEN; // Read another token
481                 break;
482             default:
483                 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
484             }
485         } while (this.reusableToken.type == TOKEN);
486 
487         if (!this.record.isEmpty()) {
488             this.recordNumber++;
489             final String comment = sb == null ? null : sb.toString();
490             result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
491                     this.recordNumber);
492         }
493         return result;
494     }
495 
496 }