View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Token.Type.TOKEN;
21  
22  import java.io.Closeable;
23  import java.io.File;
24  import java.io.FileInputStream;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.InputStreamReader;
28  import java.io.Reader;
29  import java.io.StringReader;
30  import java.net.URL;
31  import java.nio.charset.Charset;
32  import java.nio.file.Files;
33  import java.nio.file.Path;
34  import java.util.ArrayList;
35  import java.util.Arrays;
36  import java.util.Iterator;
37  import java.util.LinkedHashMap;
38  import java.util.List;
39  import java.util.Map;
40  import java.util.NoSuchElementException;
41  import java.util.TreeMap;
42  
43  /**
44   * Parses CSV files according to the specified format.
45   *
46   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
47   * specification of a {@link CSVFormat}.
48   *
49   * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
50   *
51   * <h2>Creating instances</h2>
52   * <p>
53   * There are several static factory methods that can be used to create instances for various types of resources:
54   * </p>
55   * <ul>
56   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
57   *     <li>{@link #parse(String, CSVFormat)}</li>
58   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
59   * </ul>
60   * <p>
 * Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
62   *
63   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
64   * </p>
65   * <pre>
66   * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
67   *     ...
68   * }
69   * </pre>
70   *
71   * <h2>Parsing record wise</h2>
72   * <p>
73   * To parse a CSV input from a file, you write:
74   * </p>
75   *
76   * <pre>
77   * File csvData = new File(&quot;/path/to/csv&quot;);
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
79   * for (CSVRecord csvRecord : parser) {
80   *     ...
81   * }
82   * </pre>
83   *
84   * <p>
 * This will read and parse the contents of the file using the
86   * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
87   * </p>
88   *
89   * <p>
90   * To parse CSV input in a format like Excel, you write:
91   * </p>
92   *
93   * <pre>
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
95   * for (CSVRecord csvRecord : parser) {
96   *     ...
97   * }
98   * </pre>
99   *
100  * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
102  * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
103  * </p>
104  *
105  * <h2>Parsing into memory</h2>
106  * <p>
107  * If parsing record wise is not desired, the contents of the input can be read completely into memory.
108  * </p>
109  *
110  * <pre>
111  * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
112  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
113  * List&lt;CSVRecord&gt; list = parser.getRecords();
114  * </pre>
115  *
116  * <p>
117  * There are two constraints that have to be kept in mind:
118  * </p>
119  *
120  * <ol>
121  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
122  *     the input, those records will not end up in the in memory representation of your CSV data.</li>
123  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
124  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
125  * </ol>
126  *
127  * <h2>Notes</h2>
128  * <p>
129  * Internal parser state is completely covered by the format and the reader-state.
130  * </p>
131  *
132  * @see <a href="package-summary.html">package documentation for more details</a>
133  */
134 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
135 
136     /**
137      * Creates a parser for the given {@link File}.
138      *
139      * @param file
140      *            a CSV file. Must not be null.
141      * @param charset
142      *            A Charset
143      * @param format
144      *            the CSVFormat used for CSV parsing. Must not be null.
145      * @return a new parser
146      * @throws IllegalArgumentException
147      *             If the parameters of the format are inconsistent or if either file or format are null.
148      * @throws IOException
149      *             If an I/O error occurs
150      */
151     @SuppressWarnings("resource")
152     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
153         Assertions.notNull(file, "file");
154         Assertions.notNull(format, "format");
155         return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
156     }
157 
158     /**
159      * Creates a CSV parser using the given {@link CSVFormat}.
160      *
161      * <p>
162      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
163      * unless you close the {@code reader}.
164      * </p>
165      *
166      * @param inputStream
167      *            an InputStream containing CSV-formatted input. Must not be null.
168      * @param charset
169      *            a Charset.
170      * @param format
171      *            the CSVFormat used for CSV parsing. Must not be null.
172      * @return a new CSVParser configured with the given reader and format.
173      * @throws IllegalArgumentException
174      *             If the parameters of the format are inconsistent or if either reader or format are null.
175      * @throws IOException
176      *             If there is a problem reading the header or skipping the first record
177      * @since 1.5
178      */
179     @SuppressWarnings("resource")
180     public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
181             throws IOException {
182         Assertions.notNull(inputStream, "inputStream");
183         Assertions.notNull(format, "format");
184         return parse(new InputStreamReader(inputStream, charset), format);
185     }
186 
187     /**
188      * Creates a parser for the given {@link Path}.
189      *
190      * @param path
191      *            a CSV file. Must not be null.
192      * @param charset
193      *            A Charset
194      * @param format
195      *            the CSVFormat used for CSV parsing. Must not be null.
196      * @return a new parser
197      * @throws IllegalArgumentException
198      *             If the parameters of the format are inconsistent or if either file or format are null.
199      * @throws IOException
200      *             If an I/O error occurs
201      * @since 1.5
202      */
203     public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
204         Assertions.notNull(path, "path");
205         Assertions.notNull(format, "format");
206         return parse(Files.newBufferedReader(path, charset), format);
207     }
208 
209     /**
210      * Creates a CSV parser using the given {@link CSVFormat}
211      *
212      * <p>
213      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
214      * unless you close the {@code reader}.
215      * </p>
216      *
217      * @param reader
218      *            a Reader containing CSV-formatted input. Must not be null.
219      * @param format
220      *            the CSVFormat used for CSV parsing. Must not be null.
221      * @return a new CSVParser configured with the given reader and format.
222      * @throws IllegalArgumentException
223      *             If the parameters of the format are inconsistent or if either reader or format are null.
224      * @throws IOException
225      *             If there is a problem reading the header or skipping the first record
226      * @since 1.5
227      */
228     public static CSVParser parse(Reader reader, final CSVFormat format) throws IOException {
229         return new CSVParser(reader, format);
230     }
231 
232     /**
233      * Creates a parser for the given {@link String}.
234      *
235      * @param string
236      *            a CSV string. Must not be null.
237      * @param format
238      *            the CSVFormat used for CSV parsing. Must not be null.
239      * @return a new parser
240      * @throws IllegalArgumentException
241      *             If the parameters of the format are inconsistent or if either string or format are null.
242      * @throws IOException
243      *             If an I/O error occurs
244      */
245     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
246         Assertions.notNull(string, "string");
247         Assertions.notNull(format, "format");
248 
249         return new CSVParser(new StringReader(string), format);
250     }
251 
252     /**
253      * Creates a parser for the given URL.
254      *
255      * <p>
256      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
257      * you close the {@code url}.
258      * </p>
259      *
260      * @param url
261      *            a URL. Must not be null.
262      * @param charset
263      *            the charset for the resource. Must not be null.
264      * @param format
265      *            the CSVFormat used for CSV parsing. Must not be null.
266      * @return a new parser
267      * @throws IllegalArgumentException
268      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
269      * @throws IOException
270      *             If an I/O error occurs
271      */
272     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
273         Assertions.notNull(url, "url");
274         Assertions.notNull(charset, "charset");
275         Assertions.notNull(format, "format");
276 
277         return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
278     }
279 
    // the following objects are shared to reduce garbage

    /** The format that drives parsing; consulted for trimming, null-string handling and header settings. */
    private final CSVFormat format;

    /** A mapping of column names to column indices; {@code null} when the format defines no header. */
    private final Map<String, Integer> headerMap;

    /** The tokenizer that reads from the underlying reader and fills {@link #reusableToken}. */
    private final Lexer lexer;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> recordList = new ArrayList<>();

    /**
     * The number of the record most recently produced. It is incremented just before a new record is created,
     * and the constructor stores the requested first record number minus one.
     */
    private long recordNumber;

    /**
     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
     * with {@link #recordNumber}.
     */
    private final long characterOffset;

    /** The single token instance that is reset and refilled for every call to the lexer. */
    private final Token reusableToken = new Token();
304 
305     /**
306      * Customized CSV parser using the given {@link CSVFormat}
307      *
308      * <p>
309      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
310      * unless you close the {@code reader}.
311      * </p>
312      *
313      * @param reader
314      *            a Reader containing CSV-formatted input. Must not be null.
315      * @param format
316      *            the CSVFormat used for CSV parsing. Must not be null.
317      * @throws IllegalArgumentException
318      *             If the parameters of the format are inconsistent or if either reader or format are null.
319      * @throws IOException
320      *             If there is a problem reading the header or skipping the first record
321      */
322     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
323         this(reader, format, 0, 1);
324     }
325 
326     /**
327      * Customized CSV parser using the given {@link CSVFormat}
328      *
329      * <p>
330      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
331      * unless you close the {@code reader}.
332      * </p>
333      *
334      * @param reader
335      *            a Reader containing CSV-formatted input. Must not be null.
336      * @param format
337      *            the CSVFormat used for CSV parsing. Must not be null.
338      * @param characterOffset
339      *            Lexer offset when the parser does not start parsing at the beginning of the source.
340      * @param recordNumber
341      *            The next record number to assign
342      * @throws IllegalArgumentException
343      *             If the parameters of the format are inconsistent or if either reader or format are null.
344      * @throws IOException
345      *             If there is a problem reading the header or skipping the first record
346      * @since 1.1
347      */
348     @SuppressWarnings("resource")
349     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
350             throws IOException {
351         Assertions.notNull(reader, "reader");
352         Assertions.notNull(format, "format");
353 
354         this.format = format;
355         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
356         this.headerMap = this.initializeHeader();
357         this.characterOffset = characterOffset;
358         this.recordNumber = recordNumber - 1;
359     }
360 
361     private void addRecordValue(final boolean lastRecord) {
362         final String input = this.reusableToken.content.toString();
363         final String inputClean = this.format.getTrim() ? input.trim() : input;
364         if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
365             return;
366         }
367         final String nullString = this.format.getNullString();
368         this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
369     }
370 
371     /**
372      * Closes resources.
373      *
374      * @throws IOException
375      *             If an I/O error occurs
376      */
377     @Override
378     public void close() throws IOException {
379         if (this.lexer != null) {
380             this.lexer.close();
381         }
382     }
383 
384     /**
385      * Returns the current line number in the input stream.
386      *
387      * <p>
388      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
389      * the record number.
390      * </p>
391      *
392      * @return current line number
393      */
394     public long getCurrentLineNumber() {
395         return this.lexer.getCurrentLineNumber();
396     }
397 
398     /**
399      * Gets the first end-of-line string encountered.
400      *
401      * @return the first end-of-line string
402      * @since 1.5
403      */
404     public String getFirstEndOfLine() {
405         return lexer.getFirstEol();
406     }
407 
408     /**
409      * Returns a copy of the header map that iterates in column order.
410      * <p>
411      * The map keys are column names. The map values are 0-based indices.
412      * </p>
413      * @return a copy of the header map that iterates in column order.
414      */
415     public Map<String, Integer> getHeaderMap() {
416         return this.headerMap == null ? null : new LinkedHashMap<>(this.headerMap);
417     }
418 
419     /**
420      * Returns the current record number in the input stream.
421      *
422      * <p>
423      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
424      * the line number.
425      * </p>
426      *
427      * @return current record number
428      */
429     public long getRecordNumber() {
430         return this.recordNumber;
431     }
432 
433     /**
434      * Parses the CSV input according to the given format and returns the content as a list of
435      * {@link CSVRecord CSVRecords}.
436      *
437      * <p>
438      * The returned content starts at the current parse-position in the stream.
439      * </p>
440      *
441      * @return list of {@link CSVRecord CSVRecords}, may be empty
442      * @throws IOException
443      *             on parse error or input read-failure
444      */
445     public List<CSVRecord> getRecords() throws IOException {
446         CSVRecord rec;
447         final List<CSVRecord> records = new ArrayList<>();
448         while ((rec = this.nextRecord()) != null) {
449             records.add(rec);
450         }
451         return records;
452     }
453 
454     /**
455      * Initializes the name to index mapping if the format defines a header.
456      *
457      * @return null if the format has no header.
458      * @throws IOException if there is a problem reading the header or skipping the first record
459      */
460     private Map<String, Integer> initializeHeader() throws IOException {
461         Map<String, Integer> hdrMap = null;
462         final String[] formatHeader = this.format.getHeader();
463         if (formatHeader != null) {
464             hdrMap = this.format.getIgnoreHeaderCase() ?
465                     new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER) :
466                     new LinkedHashMap<String, Integer>();
467 
468             String[] headerRecord = null;
469             if (formatHeader.length == 0) {
470                 // read the header from the first line of the file
471                 final CSVRecord nextRecord = this.nextRecord();
472                 if (nextRecord != null) {
473                     headerRecord = nextRecord.values();
474                 }
475             } else {
476                 if (this.format.getSkipHeaderRecord()) {
477                     this.nextRecord();
478                 }
479                 headerRecord = formatHeader;
480             }
481 
482             // build the name to index mappings
483             if (headerRecord != null) {
484                 for (int i = 0; i < headerRecord.length; i++) {
485                     final String header = headerRecord[i];
486                     final boolean containsHeader = hdrMap.containsKey(header);
487                     final boolean emptyHeader = header == null || header.trim().isEmpty();
488                     if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
489                         throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
490                                 "\" in " + Arrays.toString(headerRecord));
491                     }
492                     hdrMap.put(header, Integer.valueOf(i));
493                 }
494             }
495         }
496         return hdrMap;
497     }
498 
499     /**
500      * Gets whether this parser is closed.
501      *
502      * @return whether this parser is closed.
503      */
504     public boolean isClosed() {
505         return this.lexer.isClosed();
506     }
507 
508     /**
509      * Returns an iterator on the records.
510      *
511      * <p>
512      * An {@link IOException} caught during the iteration are re-thrown as an
513      * {@link IllegalStateException}.
514      * </p>
515      * <p>
516      * If the parser is closed a call to {@link Iterator#next()} will throw a
517      * {@link NoSuchElementException}.
518      * </p>
519      */
520     @Override
521     public Iterator<CSVRecord> iterator() {
522         return new Iterator<CSVRecord>() {
523             private CSVRecord current;
524 
525             private CSVRecord getNextRecord() {
526                 try {
527                     return CSVParser.this.nextRecord();
528                 } catch (final IOException e) {
529                     throw new IllegalStateException(
530                             e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
531                 }
532             }
533 
534             @Override
535             public boolean hasNext() {
536                 if (CSVParser.this.isClosed()) {
537                     return false;
538                 }
539                 if (this.current == null) {
540                     this.current = this.getNextRecord();
541                 }
542 
543                 return this.current != null;
544             }
545 
546             @Override
547             public CSVRecord next() {
548                 if (CSVParser.this.isClosed()) {
549                     throw new NoSuchElementException("CSVParser has been closed");
550                 }
551                 CSVRecord next = this.current;
552                 this.current = null;
553 
554                 if (next == null) {
555                     // hasNext() wasn't called before
556                     next = this.getNextRecord();
557                     if (next == null) {
558                         throw new NoSuchElementException("No more CSV records available");
559                     }
560                 }
561 
562                 return next;
563             }
564 
565             @Override
566             public void remove() {
567                 throw new UnsupportedOperationException();
568             }
569         };
570     }
571 
572     /**
573      * Parses the next record from the current point in the stream.
574      *
575      * @return the record as an array of values, or {@code null} if the end of the stream has been reached
576      * @throws IOException
577      *             on parse error or input read-failure
578      */
579     CSVRecord nextRecord() throws IOException {
580         CSVRecord result = null;
581         this.recordList.clear();
582         StringBuilder sb = null;
583         final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
584         do {
585             this.reusableToken.reset();
586             this.lexer.nextToken(this.reusableToken);
587             switch (this.reusableToken.type) {
588             case TOKEN:
589                 this.addRecordValue(false);
590                 break;
591             case EORECORD:
592                 this.addRecordValue(true);
593                 break;
594             case EOF:
595                 if (this.reusableToken.isReady) {
596                     this.addRecordValue(true);
597                 }
598                 break;
599             case INVALID:
600                 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
601             case COMMENT: // Ignored currently
602                 if (sb == null) { // first comment for this record
603                     sb = new StringBuilder();
604                 } else {
605                     sb.append(Constants.LF);
606                 }
607                 sb.append(this.reusableToken.content);
608                 this.reusableToken.type = TOKEN; // Read another token
609                 break;
610             default:
611                 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
612             }
613         } while (this.reusableToken.type == TOKEN);
614 
615         if (!this.recordList.isEmpty()) {
616             this.recordNumber++;
617             final String comment = sb == null ? null : sb.toString();
618             result = new CSVRecord(this.recordList.toArray(new String[this.recordList.size()]), this.headerMap, comment,
619                     this.recordNumber, startCharPosition);
620         }
621         return result;
622     }
623 
624 }