View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import java.io.Closeable;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.IOException;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.URL;
28  import java.nio.charset.Charset;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Iterator;
32  import java.util.LinkedHashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.NoSuchElementException;
36  import java.util.TreeMap;
37  
38  import static org.apache.commons.csv.Token.Type.*;
39  
40  /**
41   * Parses CSV files according to the specified format.
42   *
43   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
44   * specification of a {@link CSVFormat}.
45   *
46   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
47   *
48   * <h2>Creating instances</h2>
49   * <p>
50   * There are several static factory methods that can be used to create instances for various types of resources:
51   * </p>
52   * <ul>
53   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
54   *     <li>{@link #parse(String, CSVFormat)}</li>
55   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
56   * </ul>
57   * <p>
58   * Alternatively parsers can also be created by passing a {@link Reader} directly to one of the constructors.
59   *
60   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
61   * </p>
62   * <pre>
63   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
64   *     ...
65   * }
66   * </pre>
67   *
68   * <h2>Parsing record wise</h2>
69   * <p>
70   * To parse a CSV input from a file, you write:
71   * </p>
72   *
73   * <pre>
74   * File csvData = new File(&quot;/path/to/csv&quot;);
75   * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.RFC4180);
76   * for (CSVRecord csvRecord : parser) {
77   *     ...
78   * }
79   * </pre>
80   *
81   * <p>
82   * This will read and parse the contents of the file using the
83   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
84   * </p>
85   *
86   * <p>
87   * To parse CSV input in a format like Excel, you write:
88   * </p>
89   *
90   * <pre>
91   * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.EXCEL);
92   * for (CSVRecord csvRecord : parser) {
93   *     ...
94   * }
95   * </pre>
96   *
97   * <p>
98   * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
99   * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
100  * </p>
101  *
102  * <h2>Parsing into memory</h2>
103  * <p>
104  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
105  * </p>
106  *
107  * <pre>
108  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
109  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
110  * List&lt;CSVRecord&gt; list = parser.getRecords();
111  * </pre>
112  *
113  * <p>
114  * There are two constraints that have to be kept in mind:
115  * </p>
116  *
117  * <ol>
118  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
119  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
120  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
121  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
122  * </ol>
123  *
124  * <h2>Notes</h2>
125  * <p>
126  * Internal parser state is completely covered by the format and the reader-state.
127  * </p>
128  *
129  * @version $Id: CSVParser.java 1743529 2016-05-12 17:02:05Z ggregory $
130  *
131  * @see <a href="package-summary.html">package documentation for more details</a>
132  */
133 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134 
135     /**
136      * Creates a parser for the given {@link File}.
137      *
138      * <p><strong>Note:</strong> This method internally creates a FileReader using
139      * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
140      * is executing the code. If this is insufficient create a URL to the file and use
141      * {@link #parse(URL, Charset, CSVFormat)}</p>
142      *
143      * @param file
144      *            a CSV file. Must not be null.
145      * @param charset
146      *            A charset
147      * @param format
148      *            the CSVFormat used for CSV parsing. Must not be null.
149      * @return a new parser
150      * @throws IllegalArgumentException
151      *             If the parameters of the format are inconsistent or if either file or format are null.
152      * @throws IOException
153      *             If an I/O error occurs
154      */
155     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
156         Assertions.notNull(file, "file");
157         Assertions.notNull(format, "format");
158         return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
159     }
160 
161     /**
162      * Creates a parser for the given {@link String}.
163      *
164      * @param string
165      *            a CSV string. Must not be null.
166      * @param format
167      *            the CSVFormat used for CSV parsing. Must not be null.
168      * @return a new parser
169      * @throws IllegalArgumentException
170      *             If the parameters of the format are inconsistent or if either string or format are null.
171      * @throws IOException
172      *             If an I/O error occurs
173      */
174     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
175         Assertions.notNull(string, "string");
176         Assertions.notNull(format, "format");
177 
178         return new CSVParser(new StringReader(string), format);
179     }
180 
181     /**
182      * Creates a parser for the given URL.
183      *
184      * <p>
185      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
186      * you close the {@code url}.
187      * </p>
188      *
189      * @param url
190      *            a URL. Must not be null.
191      * @param charset
192      *            the charset for the resource. Must not be null.
193      * @param format
194      *            the CSVFormat used for CSV parsing. Must not be null.
195      * @return a new parser
196      * @throws IllegalArgumentException
197      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
198      * @throws IOException
199      *             If an I/O error occurs
200      */
201     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
202         Assertions.notNull(url, "url");
203         Assertions.notNull(charset, "charset");
204         Assertions.notNull(format, "format");
205 
206         return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
207     }
208 
209     // the following objects are shared to reduce garbage
210 
211     private final CSVFormat format;
212 
213     /** A mapping of column names to column indices */
214     private final Map<String, Integer> headerMap;
215 
216     private final Lexer lexer;
217 
218     /** A record buffer for getRecord(). Grows as necessary and is reused. */
219     private final List<String> record = new ArrayList<String>();
220 
221     /**
222      * The next record number to assign.
223      */
224     private long recordNumber;
225 
226     /**
227      * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
228      * with {@link #recordNumber}.
229      */
230     private final long characterOffset;
231 
232     private final Token reusableToken = new Token();
233 
234     /**
235      * Customized CSV parser using the given {@link CSVFormat}
236      *
237      * <p>
238      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
239      * unless you close the {@code reader}.
240      * </p>
241      *
242      * @param reader
243      *            a Reader containing CSV-formatted input. Must not be null.
244      * @param format
245      *            the CSVFormat used for CSV parsing. Must not be null.
246      * @throws IllegalArgumentException
247      *             If the parameters of the format are inconsistent or if either reader or format are null.
248      * @throws IOException
249      *             If there is a problem reading the header or skipping the first record
250      */
251     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
252         this(reader, format, 0, 1);
253     }
254 
255     /**
256      * Customized CSV parser using the given {@link CSVFormat}
257      *
258      * <p>
259      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
260      * unless you close the {@code reader}.
261      * </p>
262      *
263      * @param reader
264      *            a Reader containing CSV-formatted input. Must not be null.
265      * @param format
266      *            the CSVFormat used for CSV parsing. Must not be null.
267      * @param characterOffset
268      *            Lexer offset when the parser does not start parsing at the beginning of the source.
269      * @param recordNumber
270      *            The next record number to assign
271      * @throws IllegalArgumentException
272      *             If the parameters of the format are inconsistent or if either reader or format are null.
273      * @throws IOException
274      *             If there is a problem reading the header or skipping the first record
275      * @since 1.1
276      */
277     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
278             throws IOException {
279         Assertions.notNull(reader, "reader");
280         Assertions.notNull(format, "format");
281 
282         this.format = format;
283         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
284         this.headerMap = this.initializeHeader();
285         this.characterOffset = characterOffset;
286         this.recordNumber = recordNumber - 1;
287     }
288 
289     private void addRecordValue(final boolean lastRecord) {
290         final String input = this.reusableToken.content.toString();
291         final String inputClean = this.format.getTrim() ? input.trim() : input;
292         if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
293             return;
294         }
295         final String nullString = this.format.getNullString();
296         this.record.add(inputClean.equals(nullString) ? null : inputClean);
297     }
298 
299     /**
300      * Closes resources.
301      *
302      * @throws IOException
303      *             If an I/O error occurs
304      */
305     @Override
306     public void close() throws IOException {
307         if (this.lexer != null) {
308             this.lexer.close();
309         }
310     }
311 
312     /**
313      * Returns the current line number in the input stream.
314      *
315      * <p>
316      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
317      * the record number.
318      * </p>
319      *
320      * @return current line number
321      */
322     public long getCurrentLineNumber() {
323         return this.lexer.getCurrentLineNumber();
324     }
325 
326     /**
327      * Returns a copy of the header map that iterates in column order.
328      * <p>
329      * The map keys are column names. The map values are 0-based indices.
330      * </p>
331      * @return a copy of the header map that iterates in column order.
332      */
333     public Map<String, Integer> getHeaderMap() {
334         return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
335     }
336 
337     /**
338      * Returns the current record number in the input stream.
339      *
340      * <p>
341      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
342      * the line number.
343      * </p>
344      *
345      * @return current record number
346      */
347     public long getRecordNumber() {
348         return this.recordNumber;
349     }
350 
351     /**
352      * Parses the CSV input according to the given format and returns the content as a list of
353      * {@link CSVRecord CSVRecords}.
354      *
355      * <p>
356      * The returned content starts at the current parse-position in the stream.
357      * </p>
358      *
359      * @return list of {@link CSVRecord CSVRecords}, may be empty
360      * @throws IOException
361      *             on parse error or input read-failure
362      */
363     public List<CSVRecord> getRecords() throws IOException {
364         CSVRecord rec;
365         final List<CSVRecord> records = new ArrayList<CSVRecord>();
366         while ((rec = this.nextRecord()) != null) {
367             records.add(rec);
368         }
369         return records;
370     }
371 
372     /**
373      * Initializes the name to index mapping if the format defines a header.
374      *
375      * @return null if the format has no header.
376      * @throws IOException if there is a problem reading the header or skipping the first record
377      */
378     private Map<String, Integer> initializeHeader() throws IOException {
379         Map<String, Integer> hdrMap = null;
380         final String[] formatHeader = this.format.getHeader();
381         if (formatHeader != null) {
382             hdrMap = this.format.getIgnoreHeaderCase() ?
383                     new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER) :
384                     new LinkedHashMap<String, Integer>();
385 
386             String[] headerRecord = null;
387             if (formatHeader.length == 0) {
388                 // read the header from the first line of the file
389                 final CSVRecord nextRecord = this.nextRecord();
390                 if (nextRecord != null) {
391                     headerRecord = nextRecord.values();
392                 }
393             } else {
394                 if (this.format.getSkipHeaderRecord()) {
395                     this.nextRecord();
396                 }
397                 headerRecord = formatHeader;
398             }
399 
400             // build the name to index mappings
401             if (headerRecord != null) {
402                 for (int i = 0; i < headerRecord.length; i++) {
403                     final String header = headerRecord[i];
404                     final boolean containsHeader = hdrMap.containsKey(header);
405                     final boolean emptyHeader = header == null || header.trim().isEmpty();
406                     if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
407                         throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
408                                 "\" in " + Arrays.toString(headerRecord));
409                     }
410                     hdrMap.put(header, Integer.valueOf(i));
411                 }
412             }
413         }
414         return hdrMap;
415     }
416 
417     /**
418      * Gets whether this parser is closed.
419      *
420      * @return whether this parser is closed.
421      */
422     public boolean isClosed() {
423         return this.lexer.isClosed();
424     }
425 
426     /**
427      * Returns an iterator on the records.
428      *
429      * <p>IOExceptions occurring during the iteration are wrapped in a
430      * RuntimeException.
431      * If the parser is closed a call to {@code next()} will throw a
432      * NoSuchElementException.</p>
433      */
434     @Override
435     public Iterator<CSVRecord> iterator() {
436         return new Iterator<CSVRecord>() {
437             private CSVRecord current;
438 
439             private CSVRecord getNextRecord() {
440                 try {
441                     return CSVParser.this.nextRecord();
442                 } catch (final IOException e) {
443                     // TODO: This is not great, throw an ISE instead?
444                     throw new RuntimeException(e);
445                 }
446             }
447 
448             @Override
449             public boolean hasNext() {
450                 if (CSVParser.this.isClosed()) {
451                     return false;
452                 }
453                 if (this.current == null) {
454                     this.current = this.getNextRecord();
455                 }
456 
457                 return this.current != null;
458             }
459 
460             @Override
461             public CSVRecord next() {
462                 if (CSVParser.this.isClosed()) {
463                     throw new NoSuchElementException("CSVParser has been closed");
464                 }
465                 CSVRecord next = this.current;
466                 this.current = null;
467 
468                 if (next == null) {
469                     // hasNext() wasn't called before
470                     next = this.getNextRecord();
471                     if (next == null) {
472                         throw new NoSuchElementException("No more CSV records available");
473                     }
474                 }
475 
476                 return next;
477             }
478 
479             @Override
480             public void remove() {
481                 throw new UnsupportedOperationException();
482             }
483         };
484     }
485 
486     /**
487      * Parses the next record from the current point in the stream.
488      *
489      * @return the record as an array of values, or {@code null} if the end of the stream has been reached
490      * @throws IOException
491      *             on parse error or input read-failure
492      */
493     CSVRecord nextRecord() throws IOException {
494         CSVRecord result = null;
495         this.record.clear();
496         StringBuilder sb = null;
497         final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
498         do {
499             this.reusableToken.reset();
500             this.lexer.nextToken(this.reusableToken);
501             switch (this.reusableToken.type) {
502             case TOKEN:
503                 this.addRecordValue(false);
504                 break;
505             case EORECORD:
506                 this.addRecordValue(true);
507                 break;
508             case EOF:
509                 if (this.reusableToken.isReady) {
510                     this.addRecordValue(true);
511                 }
512                 break;
513             case INVALID:
514                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
515             case COMMENT: // Ignored currently
516                 if (sb == null) { // first comment for this record
517                     sb = new StringBuilder();
518                 } else {
519                     sb.append(Constants.LF);
520                 }
521                 sb.append(this.reusableToken.content);
522                 this.reusableToken.type = TOKEN; // Read another token
523                 break;
524             default:
525                 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
526             }
527         } while (this.reusableToken.type == TOKEN);
528 
529         if (!this.record.isEmpty()) {
530             this.recordNumber++;
531             final String comment = sb == null ? null : sb.toString();
532             result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
533                     this.recordNumber, startCharPosition);
534         }
535         return result;
536     }
537 
538 }