View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import java.io.Closeable;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.URL;
28  import java.nio.charset.Charset;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Iterator;
32  import java.util.LinkedHashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.NoSuchElementException;
36  
37  import static org.apache.commons.csv.Token.Type.*;
38  
39  /**
40   * Parses CSV files according to the specified format.
41   *
42   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
43   * specification of a {@link CSVFormat}.
44   *
45   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
46   *
47   * <h2>Creating instances</h2>
48   * <p>
49   * There are several static factory methods that can be used to create instances for various types of resources:
50   * </p>
51   * <ul>
52   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
53   *     <li>{@link #parse(String, CSVFormat)}</li>
54   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
55   * </ul>
56   * <p>
57   * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
58   *
59   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
60   * </p>
61   * <pre>
62   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
63   *     ...
64   * }
65   * </pre>
66   *
67   * <h2>Parsing record wise</h2>
68   * <p>
69   * To parse a CSV input from a file, you write:
70   * </p>
71   *
72   * <pre>
73   * File csvData = new File(&quot;/path/to/csv&quot;);
74   * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
75   * for (CSVRecord csvRecord : parser) {
76   *     ...
77   * }
78   * </pre>
79   *
80   * <p>
81   * This will read and parse the contents of the file using the
82   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
83   * </p>
84   *
85   * <p>
86   * To parse CSV input in a format like Excel, you write:
87   * </p>
88   *
89   * <pre>
90   * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
91   * for (CSVRecord csvRecord : parser) {
92   *     ...
93   * }
94   * </pre>
95   *
96   * <p>
97   * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
98   * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
99   * </p>
100  *
101  * <h2>Parsing into memory</h2>
102  * <p>
103  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
104  * </p>
105  *
106  * <pre>
107  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
108  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
109  * List&lt;CSVRecord&gt; list = parser.getRecords();
110  * </pre>
111  *
112  * <p>
113  * There are two constraints that have to be kept in mind:
114  * </p>
115  *
116  * <ol>
117  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121  * </ol>
122  *
123  * <h2>Notes</h2>
124  * <p>
125  * Internal parser state is completely covered by the format and the reader-state.
126  * </p>
127  *
128  * @version $Id: CSVParser.java 1617069 2014-08-10 08:53:42Z britter $
129  *
130  * @see <a href="package-summary.html">package documentation for more details</a>
131  */
132 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
133 
134     /**
135      * Creates a parser for the given {@link File}.
136      *
137      * <p><strong>Note:</strong> This method internally creates a FileReader using
138      * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
139      * is executing the code. If this is insufficient create a URL to the file and use
140      * {@link #parse(URL, Charset, CSVFormat)}</p>
141      *
142      * @param file
143      *            a CSV file. Must not be null.
144      * @param charset
145      *            A charset
146      * @param format
147      *            the CSVFormat used for CSV parsing. Must not be null.
148      * @return a new parser
149      * @throws IllegalArgumentException
150      *             If the parameters of the format are inconsistent or if either file or format are null.
151      * @throws IOException
152      *             If an I/O error occurs
153      */
154     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
155         Assertions.notNull(file, "file");
156         Assertions.notNull(format, "format");
157         // Use the default Charset explicitly
158         return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
159     }
160 
161     /**
162      * Creates a parser for the given {@link String}.
163      *
164      * @param string
165      *            a CSV string. Must not be null.
166      * @param format
167      *            the CSVFormat used for CSV parsing. Must not be null.
168      * @return a new parser
169      * @throws IllegalArgumentException
170      *             If the parameters of the format are inconsistent or if either string or format are null.
171      * @throws IOException
172      *             If an I/O error occurs
173      */
174     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
175         Assertions.notNull(string, "string");
176         Assertions.notNull(format, "format");
177 
178         return new CSVParser(new StringReader(string), format);
179     }
180 
181     /**
182      * Creates a parser for the given URL.
183      *
184      * <p>
185      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
186      * you close the {@code url}.
187      * </p>
188      *
189      * @param url
190      *            a URL. Must not be null.
191      * @param charset
192      *            the charset for the resource. Must not be null.
193      * @param format
194      *            the CSVFormat used for CSV parsing. Must not be null.
195      * @return a new parser
196      * @throws IllegalArgumentException
197      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
198      * @throws IOException
199      *             If an I/O error occurs
200      */
201     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
202         Assertions.notNull(url, "url");
203         Assertions.notNull(charset, "charset");
204         Assertions.notNull(format, "format");
205 
206         return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
207     }
208 
209     // the following objects are shared to reduce garbage
210 
211     private final CSVFormat format;
212 
213     /** A mapping of column names to column indices */
214     private final Map<String, Integer> headerMap;
215 
216     private final Lexer lexer;
217 
218     /** A record buffer for getRecord(). Grows as necessary and is reused. */
219     private final List<String> record = new ArrayList<String>();
220 
221     private long recordNumber;
222 
223     private final Token reusableToken = new Token();
224 
225     /**
226      * Customized CSV parser using the given {@link CSVFormat}
227      *
228      * <p>
229      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
230      * unless you close the {@code reader}.
231      * </p>
232      *
233      * @param reader
234      *            a Reader containing CSV-formatted input. Must not be null.
235      * @param format
236      *            the CSVFormat used for CSV parsing. Must not be null.
237      * @throws IllegalArgumentException
238      *             If the parameters of the format are inconsistent or if either reader or format are null.
239      * @throws IOException
240      *             If there is a problem reading the header or skipping the first record
241      */
242     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
243         Assertions.notNull(reader, "reader");
244         Assertions.notNull(format, "format");
245 
246         this.format = format;
247         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
248         this.headerMap = this.initializeHeader();
249     }
250 
251     private void addRecordValue() {
252         final String input = this.reusableToken.content.toString();
253         final String nullString = this.format.getNullString();
254         if (nullString == null) {
255             this.record.add(input);
256         } else {
257             this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
258         }
259     }
260 
261     /**
262      * Closes resources.
263      *
264      * @throws IOException
265      *             If an I/O error occurs
266      */
267     public void close() throws IOException {
268         if (this.lexer != null) {
269             this.lexer.close();
270         }
271     }
272 
273     /**
274      * Returns the current line number in the input stream.
275      *
276      * <p>
277      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
278      * the record number.
279      * </p>
280      *
281      * @return current line number
282      */
283     public long getCurrentLineNumber() {
284         return this.lexer.getCurrentLineNumber();
285     }
286 
287     /**
288      * Returns a copy of the header map that iterates in column order.
289      * <p>
290      * The map keys are column names. The map values are 0-based indices.
291      * </p>
292      * @return a copy of the header map that iterates in column order.
293      */
294     public Map<String, Integer> getHeaderMap() {
295         return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
296     }
297 
298     /**
299      * Returns the current record number in the input stream.
300      *
301      * <p>
302      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
303      * the line number.
304      * </p>
305      *
306      * @return current line number
307      */
308     public long getRecordNumber() {
309         return this.recordNumber;
310     }
311 
312     /**
313      * Parses the CSV input according to the given format and returns the content as a list of
314      * {@link CSVRecord CSVRecords}.
315      *
316      * <p>
317      * The returned content starts at the current parse-position in the stream.
318      * </p>
319      *
320      * @return list of {@link CSVRecord CSVRecords}, may be empty
321      * @throws IOException
322      *             on parse error or input read-failure
323      */
324     public List<CSVRecord> getRecords() throws IOException {
325         CSVRecord rec;
326         List<CSVRecord> records = new ArrayList<CSVRecord>();
327         while ((rec = this.nextRecord()) != null) {
328             records.add(rec);
329         }
330         return records;
331     }
332 
333     /**
334      * Initializes the name to index mapping if the format defines a header.
335      *
336      * @return null if the format has no header.
337      * @throws IOException if there is a problem reading the header or skipping the first record
338      */
339     private Map<String, Integer> initializeHeader() throws IOException {
340         Map<String, Integer> hdrMap = null;
341         final String[] formatHeader = this.format.getHeader();
342         if (formatHeader != null) {
343             hdrMap = new LinkedHashMap<String, Integer>();
344 
345             String[] headerRecord = null;
346             if (formatHeader.length == 0) {
347                 // read the header from the first line of the file
348                 final CSVRecord nextRecord = this.nextRecord();
349                 if (nextRecord != null) {
350                     headerRecord = nextRecord.values();
351                 }
352             } else {
353                 if (this.format.getSkipHeaderRecord()) {
354                     this.nextRecord();
355                 }
356                 headerRecord = formatHeader;
357             }
358 
359             // build the name to index mappings
360             if (headerRecord != null) {
361                 for (int i = 0; i < headerRecord.length; i++) {
362                     final String header = headerRecord[i];
363                     final boolean containsHeader = hdrMap.containsKey(header);
364                     final boolean emptyHeader = header == null || header.trim().isEmpty();
365                     if (containsHeader &&
366                             (!emptyHeader || (emptyHeader && !this.format.getAllowMissingColumnNames()))) {
367                         throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
368                                 "\" in " + Arrays.toString(headerRecord));
369                     }
370                     hdrMap.put(header, Integer.valueOf(i));
371                 }
372             }
373         }
374         return hdrMap;
375     }
376 
377     public boolean isClosed() {
378         return this.lexer.isClosed();
379     }
380 
381     /**
382      * Returns an iterator on the records.
383      *
384      * <p>IOExceptions occurring during the iteration are wrapped in a
385      * RuntimeException.
386      * If the parser is closed a call to {@code next()} will throw a
387      * NoSuchElementException.</p>
388      */
389     public Iterator<CSVRecord> iterator() {
390         return new Iterator<CSVRecord>() {
391             private CSVRecord current;
392 
393             private CSVRecord getNextRecord() {
394                 try {
395                     return CSVParser.this.nextRecord();
396                 } catch (final IOException e) {
397                     // TODO: This is not great, throw an ISE instead?
398                     throw new RuntimeException(e);
399                 }
400             }
401 
402             public boolean hasNext() {
403                 if (CSVParser.this.isClosed()) {
404                     return false;
405                 }
406                 if (this.current == null) {
407                     this.current = this.getNextRecord();
408                 }
409 
410                 return this.current != null;
411             }
412 
413             public CSVRecord next() {
414                 if (CSVParser.this.isClosed()) {
415                     throw new NoSuchElementException("CSVParser has been closed");
416                 }
417                 CSVRecord next = this.current;
418                 this.current = null;
419 
420                 if (next == null) {
421                     // hasNext() wasn't called before
422                     next = this.getNextRecord();
423                     if (next == null) {
424                         throw new NoSuchElementException("No more CSV records available");
425                     }
426                 }
427 
428                 return next;
429             }
430 
431             public void remove() {
432                 throw new UnsupportedOperationException();
433             }
434         };
435     }
436 
437     /**
438      * Parses the next record from the current point in the stream.
439      *
440      * @return the record as an array of values, or {@code null} if the end of the stream has been reached
441      * @throws IOException
442      *             on parse error or input read-failure
443      */
444     CSVRecord nextRecord() throws IOException {
445         CSVRecord result = null;
446         this.record.clear();
447         StringBuilder sb = null;
448         do {
449             this.reusableToken.reset();
450             this.lexer.nextToken(this.reusableToken);
451             switch (this.reusableToken.type) {
452             case TOKEN:
453                 this.addRecordValue();
454                 break;
455             case EORECORD:
456                 this.addRecordValue();
457                 break;
458             case EOF:
459                 if (this.reusableToken.isReady) {
460                     this.addRecordValue();
461                 }
462                 break;
463             case INVALID:
464                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
465             case COMMENT: // Ignored currently
466                 if (sb == null) { // first comment for this record
467                     sb = new StringBuilder();
468                 } else {
469                     sb.append(Constants.LF);
470                 }
471                 sb.append(this.reusableToken.content);
472                 this.reusableToken.type = TOKEN; // Read another token
473                 break;
474             default:
475                 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
476             }
477         } while (this.reusableToken.type == TOKEN);
478 
479         if (!this.record.isEmpty()) {
480             this.recordNumber++;
481             final String comment = sb == null ? null : sb.toString();
482             result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
483                     this.recordNumber);
484         }
485         return result;
486     }
487 
488 }