View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import java.io.Closeable;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.URL;
28  import java.nio.charset.Charset;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Iterator;
32  import java.util.LinkedHashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.NoSuchElementException;
36  
37  import static org.apache.commons.csv.Token.Type.*;
38  
39  /**
40   * Parses CSV files according to the specified format.
41   *
42   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
43   * specification of a {@link CSVFormat}.
44   *
45   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
46   *
47   * <h2>Creating instances</h2>
48   * <p>
49   * There are several static factory methods that can be used to create instances for various types of resources:
50   * </p>
51   * <ul>
52   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
53   *     <li>{@link #parse(String, CSVFormat)}</li>
54   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
55   * </ul>
56   * <p>
57   * Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
58   *
59   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
60   * </p>
61   * <pre>
62   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
63   *     ...
64   * }
65   * </pre>
66   *
67   * <h2>Parsing record wise</h2>
68   * <p>
69   * To parse a CSV input from a file, you write:
70   * </p>
71   *
72   * <pre>
73   * File csvData = new File(&quot;/path/to/csv&quot;);
74   * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.RFC4180);
75   * for (CSVRecord csvRecord : parser) {
76   *     ...
77   * }
78   * </pre>
79   *
80   * <p>
81   * This will read and parse the contents of the file using the
82   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
83   * </p>
84   *
85   * <p>
86   * To parse CSV input in a format like Excel, you write:
87   * </p>
88   *
89   * <pre>
90   * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.EXCEL);
91   * for (CSVRecord csvRecord : parser) {
92   *     ...
93   * }
94   * </pre>
95   *
96   * <p>
97   * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
98   * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
99   * </p>
100  *
101  * <h2>Parsing into memory</h2>
102  * <p>
103  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
104  * </p>
105  *
106  * <pre>
107  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
108  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
109  * List&lt;CSVRecord&gt; list = parser.getRecords();
110  * </pre>
111  *
112  * <p>
113  * There are two constraints that have to be kept in mind:
114  * </p>
115  *
116  * <ol>
117  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121  * </ol>
122  *
123  * <h2>Notes</h2>
124  * <p>
125  * Internal parser state is completely covered by the format and the reader-state.
126  * </p>
127  *
128  * @version $Id: CSVParser.java 1637611 2014-11-08 23:38:48Z ggregory $
129  *
130  * @see <a href="package-summary.html">package documentation for more details</a>
131  */
132 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
133 
134     /**
135      * Creates a parser for the given {@link File}.
136      *
137      * <p><strong>Note:</strong> This method internally creates a FileReader using
138      * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
139      * is executing the code. If this is insufficient create a URL to the file and use
140      * {@link #parse(URL, Charset, CSVFormat)}</p>
141      *
142      * @param file
143      *            a CSV file. Must not be null.
144      * @param charset
145      *            A charset
146      * @param format
147      *            the CSVFormat used for CSV parsing. Must not be null.
148      * @return a new parser
149      * @throws IllegalArgumentException
150      *             If the parameters of the format are inconsistent or if either file or format are null.
151      * @throws IOException
152      *             If an I/O error occurs
153      */
154     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
155         Assertions.notNull(file, "file");
156         Assertions.notNull(format, "format");
157         return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
158     }
159 
160     /**
161      * Creates a parser for the given {@link String}.
162      *
163      * @param string
164      *            a CSV string. Must not be null.
165      * @param format
166      *            the CSVFormat used for CSV parsing. Must not be null.
167      * @return a new parser
168      * @throws IllegalArgumentException
169      *             If the parameters of the format are inconsistent or if either string or format are null.
170      * @throws IOException
171      *             If an I/O error occurs
172      */
173     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
174         Assertions.notNull(string, "string");
175         Assertions.notNull(format, "format");
176 
177         return new CSVParser(new StringReader(string), format);
178     }
179 
180     /**
181      * Creates a parser for the given URL.
182      *
183      * <p>
184      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
185      * you close the {@code url}.
186      * </p>
187      *
188      * @param url
189      *            a URL. Must not be null.
190      * @param charset
191      *            the charset for the resource. Must not be null.
192      * @param format
193      *            the CSVFormat used for CSV parsing. Must not be null.
194      * @return a new parser
195      * @throws IllegalArgumentException
196      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
197      * @throws IOException
198      *             If an I/O error occurs
199      */
200     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
201         Assertions.notNull(url, "url");
202         Assertions.notNull(charset, "charset");
203         Assertions.notNull(format, "format");
204 
205         return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
206     }
207 
208     // the following objects are shared to reduce garbage
209 
210     private final CSVFormat format;
211 
212     /** A mapping of column names to column indices */
213     private final Map<String, Integer> headerMap;
214 
215     private final Lexer lexer;
216 
217     /** A record buffer for getRecord(). Grows as necessary and is reused. */
218     private final List<String> record = new ArrayList<String>();
219 
220     /**
221      * The next record number to assign.
222      */
223     private long recordNumber;
224 
225     /**
226      * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
227      * with {@link #recordNumber}.
228      */
229     private final long characterOffset;
230 
231     private final Token reusableToken = new Token();
232 
233     /**
234      * Customized CSV parser using the given {@link CSVFormat}
235      *
236      * <p>
237      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
238      * unless you close the {@code reader}.
239      * </p>
240      *
241      * @param reader
242      *            a Reader containing CSV-formatted input. Must not be null.
243      * @param format
244      *            the CSVFormat used for CSV parsing. Must not be null.
245      * @throws IllegalArgumentException
246      *             If the parameters of the format are inconsistent or if either reader or format are null.
247      * @throws IOException
248      *             If there is a problem reading the header or skipping the first record
249      */
250     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
251         this(reader, format, 0, 1);
252     }
253 
254     /**
255      * Customized CSV parser using the given {@link CSVFormat}
256      *
257      * <p>
258      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
259      * unless you close the {@code reader}.
260      * </p>
261      *
262      * @param reader
263      *            a Reader containing CSV-formatted input. Must not be null.
264      * @param format
265      *            the CSVFormat used for CSV parsing. Must not be null.
266      * @param characterOffset
267      *            Lexer offset when the parser does not start parsing at the beginning of the source.
268      * @param recordNumber
269      *            The next record number to assign
270      * @throws IllegalArgumentException
271      *             If the parameters of the format are inconsistent or if either reader or format are null.
272      * @throws IOException
273      *             If there is a problem reading the header or skipping the first record
274      * @since 1.1
275      */
276     public CSVParser(final Reader reader, final CSVFormat format, long characterOffset, long recordNumber)
277             throws IOException {
278         Assertions.notNull(reader, "reader");
279         Assertions.notNull(format, "format");
280 
281         this.format = format;
282         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
283         this.headerMap = this.initializeHeader();
284         this.characterOffset = characterOffset;
285         this.recordNumber = recordNumber - 1;
286     }
287 
288     private void addRecordValue() {
289         final String input = this.reusableToken.content.toString();
290         final String nullString = this.format.getNullString();
291         if (nullString == null) {
292             this.record.add(input);
293         } else {
294             this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
295         }
296     }
297 
298     /**
299      * Closes resources.
300      *
301      * @throws IOException
302      *             If an I/O error occurs
303      */
304     @Override
305     public void close() throws IOException {
306         if (this.lexer != null) {
307             this.lexer.close();
308         }
309     }
310 
311     /**
312      * Returns the current line number in the input stream.
313      *
314      * <p>
315      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
316      * the record number.
317      * </p>
318      *
319      * @return current line number
320      */
321     public long getCurrentLineNumber() {
322         return this.lexer.getCurrentLineNumber();
323     }
324 
325     /**
326      * Returns a copy of the header map that iterates in column order.
327      * <p>
328      * The map keys are column names. The map values are 0-based indices.
329      * </p>
330      * @return a copy of the header map that iterates in column order.
331      */
332     public Map<String, Integer> getHeaderMap() {
333         return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
334     }
335 
336     /**
337      * Returns the current record number in the input stream.
338      *
339      * <p>
340      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
341      * the line number.
342      * </p>
343      *
344      * @return current record number
345      */
346     public long getRecordNumber() {
347         return this.recordNumber;
348     }
349 
350     /**
351      * Parses the CSV input according to the given format and returns the content as a list of
352      * {@link CSVRecord CSVRecords}.
353      *
354      * <p>
355      * The returned content starts at the current parse-position in the stream.
356      * </p>
357      *
358      * @return list of {@link CSVRecord CSVRecords}, may be empty
359      * @throws IOException
360      *             on parse error or input read-failure
361      */
362     public List<CSVRecord> getRecords() throws IOException {
363         CSVRecord rec;
364         List<CSVRecord> records = new ArrayList<CSVRecord>();
365         while ((rec = this.nextRecord()) != null) {
366             records.add(rec);
367         }
368         return records;
369     }
370 
371     /**
372      * Initializes the name to index mapping if the format defines a header.
373      *
374      * @return null if the format has no header.
375      * @throws IOException if there is a problem reading the header or skipping the first record
376      */
377     private Map<String, Integer> initializeHeader() throws IOException {
378         Map<String, Integer> hdrMap = null;
379         final String[] formatHeader = this.format.getHeader();
380         if (formatHeader != null) {
381             hdrMap = new LinkedHashMap<String, Integer>();
382 
383             String[] headerRecord = null;
384             if (formatHeader.length == 0) {
385                 // read the header from the first line of the file
386                 final CSVRecord nextRecord = this.nextRecord();
387                 if (nextRecord != null) {
388                     headerRecord = nextRecord.values();
389                 }
390             } else {
391                 if (this.format.getSkipHeaderRecord()) {
392                     this.nextRecord();
393                 }
394                 headerRecord = formatHeader;
395             }
396 
397             // build the name to index mappings
398             if (headerRecord != null) {
399                 for (int i = 0; i < headerRecord.length; i++) {
400                     final String header = headerRecord[i];
401                     final boolean containsHeader = hdrMap.containsKey(header);
402                     final boolean emptyHeader = header == null || header.trim().isEmpty();
403                     if (containsHeader &&
404                             (!emptyHeader || (emptyHeader && !this.format.getAllowMissingColumnNames()))) {
405                         throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
406                                 "\" in " + Arrays.toString(headerRecord));
407                     }
408                     hdrMap.put(header, Integer.valueOf(i));
409                 }
410             }
411         }
412         return hdrMap;
413     }
414 
415     /**
416      * Gets whether this parser is closed.
417      *
418      * @return whether this parser is closed.
419      */
420     public boolean isClosed() {
421         return this.lexer.isClosed();
422     }
423 
424     /**
425      * Returns an iterator on the records.
426      *
427      * <p>IOExceptions occurring during the iteration are wrapped in a
428      * RuntimeException.
429      * If the parser is closed a call to {@code next()} will throw a
430      * NoSuchElementException.</p>
431      */
432     @Override
433     public Iterator<CSVRecord> iterator() {
434         return new Iterator<CSVRecord>() {
435             private CSVRecord current;
436 
437             private CSVRecord getNextRecord() {
438                 try {
439                     return CSVParser.this.nextRecord();
440                 } catch (final IOException e) {
441                     // TODO: This is not great, throw an ISE instead?
442                     throw new RuntimeException(e);
443                 }
444             }
445 
446             @Override
447             public boolean hasNext() {
448                 if (CSVParser.this.isClosed()) {
449                     return false;
450                 }
451                 if (this.current == null) {
452                     this.current = this.getNextRecord();
453                 }
454 
455                 return this.current != null;
456             }
457 
458             @Override
459             public CSVRecord next() {
460                 if (CSVParser.this.isClosed()) {
461                     throw new NoSuchElementException("CSVParser has been closed");
462                 }
463                 CSVRecord next = this.current;
464                 this.current = null;
465 
466                 if (next == null) {
467                     // hasNext() wasn't called before
468                     next = this.getNextRecord();
469                     if (next == null) {
470                         throw new NoSuchElementException("No more CSV records available");
471                     }
472                 }
473 
474                 return next;
475             }
476 
477             @Override
478             public void remove() {
479                 throw new UnsupportedOperationException();
480             }
481         };
482     }
483 
484     /**
485      * Parses the next record from the current point in the stream.
486      *
487      * @return the record as an array of values, or {@code null} if the end of the stream has been reached
488      * @throws IOException
489      *             on parse error or input read-failure
490      */
491     CSVRecord nextRecord() throws IOException {
492         CSVRecord result = null;
493         this.record.clear();
494         StringBuilder sb = null;
495         final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
496         do {
497             this.reusableToken.reset();
498             this.lexer.nextToken(this.reusableToken);
499             switch (this.reusableToken.type) {
500             case TOKEN:
501                 this.addRecordValue();
502                 break;
503             case EORECORD:
504                 this.addRecordValue();
505                 break;
506             case EOF:
507                 if (this.reusableToken.isReady) {
508                     this.addRecordValue();
509                 }
510                 break;
511             case INVALID:
512                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
513             case COMMENT: // Ignored currently
514                 if (sb == null) { // first comment for this record
515                     sb = new StringBuilder();
516                 } else {
517                     sb.append(Constants.LF);
518                 }
519                 sb.append(this.reusableToken.content);
520                 this.reusableToken.type = TOKEN; // Read another token
521                 break;
522             default:
523                 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
524             }
525         } while (this.reusableToken.type == TOKEN);
526 
527         if (!this.record.isEmpty()) {
528             this.recordNumber++;
529             final String comment = sb == null ? null : sb.toString();
530             result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
531                     this.recordNumber, startCharPosition);
532         }
533         return result;
534     }
535 
536 }