001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import static org.apache.commons.csv.Token.Type.TOKEN;
021
022import java.io.Closeable;
023import java.io.File;
024import java.io.FileReader;
025import java.io.IOException;
026import java.io.InputStreamReader;
027import java.io.Reader;
028import java.io.StringReader;
029import java.net.URL;
030import java.nio.charset.Charset;
031import java.util.ArrayList;
032import java.util.Arrays;
033import java.util.Collection;
034import java.util.Iterator;
035import java.util.LinkedHashMap;
036import java.util.List;
037import java.util.Map;
038import java.util.NoSuchElementException;
039
040/**
041 * Parses CSV files according to the specified format.
042 *
043 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
044 * specification of a {@link CSVFormat}.
045 *
046 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
047 *
048 * <h2>Creating instances</h2>
049 * <p>
050 * There are several static factory methods that can be used to create instances for various types of resources:
051 * </p>
052 * <ul>
053 *     <li>{@link #parse(java.io.File, CSVFormat)}</li>
054 *     <li>{@link #parse(String, CSVFormat)}</li>
055 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
056 * </ul>
057 * <p>
058 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
059 *
060 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
061 * </p>
062 * <pre>
063 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
064 *     ...
065 * }
066 * </pre>
067 *
068 * <h2>Parsing record wise</h2>
069 * <p>
070 * To parse a CSV input from a file, you write:
071 * </p>
072 *
073 * <pre>
074 * File csvData = new File(&quot;/path/to/csv&quot;);
075 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
076 * for (CSVRecord csvRecord : parser) {
077 *     ...
078 * }
079 * </pre>
080 *
081 * <p>
082 * This will read and parse the contents of the file using the
083 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
084 * </p>
085 *
086 * <p>
087 * To parse CSV input in a format like Excel, you write:
088 * </p>
089 *
090 * <pre>
091 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
092 * for (CSVRecord csvRecord : parser) {
093 *     ...
094 * }
095 * </pre>
096 *
097 * <p>
098 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
099 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
100 * </p>
101 *
102 * <h2>Parsing into memory</h2>
103 * <p>
104 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
105 * </p>
106 *
107 * <pre>
108 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
109 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
110 * List&lt;CSVRecord&gt; list = parser.getRecords();
111 * </pre>
112 *
113 * <p>
114 * There are two constraints that have to be kept in mind:
115 * </p>
116 *
117 * <ol>
118 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
119 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
120 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
121 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
122 * </ol>
123 *
124 * <h2>Notes</h2>
125 * <p>
126 * Internal parser state is completely covered by the format and the reader-state.
127 * </p>
128 *
129 * @version $Id: CSVParser.java 1592382 2014-05-04 17:01:07Z britter $
130 *
131 * @see <a href="package-summary.html">package documentation for more details</a>
132 */
133public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134
135    /**
136     * Creates a parser for the given {@link File}.
137     *
138     * @param file
139     *            a CSV file. Must not be null.
140     * @param format
141     *            the CSVFormat used for CSV parsing. Must not be null.
142     * @return a new parser
143     * @throws IllegalArgumentException
144     *             If the parameters of the format are inconsistent or if either file or format are null.
145     * @throws IOException
146     *             If an I/O error occurs
147     */
148    public static CSVParser parse(final File file, final CSVFormat format) throws IOException {
149        Assertions.notNull(file, "file");
150        Assertions.notNull(format, "format");
151
152        return new CSVParser(new FileReader(file), format);
153    }
154
155    /**
156     * Creates a parser for the given {@link String}.
157     *
158     * @param string
159     *            a CSV string. Must not be null.
160     * @param format
161     *            the CSVFormat used for CSV parsing. Must not be null.
162     * @return a new parser
163     * @throws IllegalArgumentException
164     *             If the parameters of the format are inconsistent or if either string or format are null.
165     * @throws IOException
166     *             If an I/O error occurs
167     */
168    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
169        Assertions.notNull(string, "string");
170        Assertions.notNull(format, "format");
171
172        return new CSVParser(new StringReader(string), format);
173    }
174
175    /**
176     * Creates a parser for the given URL.
177     *
178     * <p>
179     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
180     * you close the {@code url}.
181     * </p>
182     *
183     * @param url
184     *            a URL. Must not be null.
185     * @param charset
186     *            the charset for the resource. Must not be null.
187     * @param format
188     *            the CSVFormat used for CSV parsing. Must not be null.
189     * @return a new parser
190     * @throws IllegalArgumentException
191     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
192     * @throws IOException
193     *             If an I/O error occurs
194     */
195    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
196        Assertions.notNull(url, "url");
197        Assertions.notNull(charset, "charset");
198        Assertions.notNull(format, "format");
199
200        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
201    }
202
203    // the following objects are shared to reduce garbage
204
205    private final CSVFormat format;
206
207    /** A mapping of column names to column indices */
208    private final Map<String, Integer> headerMap;
209
210    private final Lexer lexer;
211
212    /** A record buffer for getRecord(). Grows as necessary and is reused. */
213    private final List<String> record = new ArrayList<String>();
214
215    private long recordNumber;
216
217    private final Token reusableToken = new Token();
218
219    /**
220     * Customized CSV parser using the given {@link CSVFormat}
221     *
222     * <p>
223     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
224     * unless you close the {@code reader}.
225     * </p>
226     *
227     * @param reader
228     *            a Reader containing CSV-formatted input. Must not be null.
229     * @param format
230     *            the CSVFormat used for CSV parsing. Must not be null.
231     * @throws IllegalArgumentException
232     *             If the parameters of the format are inconsistent or if either reader or format are null.
233     * @throws IOException
234     *             If an I/O error occurs
235     */
236    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
237        Assertions.notNull(reader, "reader");
238        Assertions.notNull(format, "format");
239
240        format.validate();
241        this.format = format;
242        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
243        this.headerMap = this.initializeHeader();
244    }
245
246    private void addRecordValue() {
247        final String input = this.reusableToken.content.toString();
248        final String nullString = this.format.getNullString();
249        if (nullString == null) {
250            this.record.add(input);
251        } else {
252            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
253        }
254    }
255
256    /**
257     * Closes resources.
258     *
259     * @throws IOException
260     *             If an I/O error occurs
261     */
262    public void close() throws IOException {
263        if (this.lexer != null) {
264            this.lexer.close();
265        }
266    }
267
268    /**
269     * Returns the current line number in the input stream.
270     *
271     * <p>
272     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
273     * the record number.
274     * </p>
275     *
276     * @return current line number
277     */
278    public long getCurrentLineNumber() {
279        return this.lexer.getCurrentLineNumber();
280    }
281
282    /**
283     * Returns a copy of the header map that iterates in column order.
284     * <p>
285     * The map keys are column names. The map values are 0-based indices.
286     * </p>
287     * @return a copy of the header map that iterates in column order.
288     */
289    public Map<String, Integer> getHeaderMap() {
290        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
291    }
292
293    /**
294     * Returns the current record number in the input stream.
295     *
296     * <p>
297     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
298     * the line number.
299     * </p>
300     *
301     * @return current line number
302     */
303    public long getRecordNumber() {
304        return this.recordNumber;
305    }
306
307    /**
308     * Parses the CSV input according to the given format and returns the content as a list of
309     * {@link CSVRecord CSVRecords}.
310     *
311     * <p>
312     * The returned content starts at the current parse-position in the stream.
313     * </p>
314     *
315     * @return list of {@link CSVRecord CSVRecords}, may be empty
316     * @throws IOException
317     *             on parse error or input read-failure
318     */
319    public List<CSVRecord> getRecords() throws IOException {
320        return getRecords(new ArrayList<CSVRecord>());
321    }
322
323    /**
324     * Parses the CSV input according to the given format and adds the content to the collection of {@link CSVRecord
325     * CSVRecords}.
326     *
327     * <p>
328     * The returned content starts at the current parse-position in the stream.
329     * </p>
330     *
331     * @param records
332     *            The collection to add to.
333     * @param <T> the type of collection used.
334     * @return a collection of {@link CSVRecord CSVRecords}, may be empty
335     * @throws IOException
336     *             on parse error or input read-failure
337     */
338    public <T extends Collection<CSVRecord>> T getRecords(T records) throws IOException {
339        CSVRecord rec;
340        while ((rec = this.nextRecord()) != null) {
341            records.add(rec);
342        }
343        return records;
344    }
345
346    /**
347     * Initializes the name to index mapping if the format defines a header.
348     *
349     * @return null if the format has no header.
350     */
351    private Map<String, Integer> initializeHeader() throws IOException {
352        Map<String, Integer> hdrMap = null;
353        final String[] formatHeader = this.format.getHeader();
354        if (formatHeader != null) {
355            hdrMap = new LinkedHashMap<String, Integer>();
356
357            String[] header = null;
358            if (formatHeader.length == 0) {
359                // read the header from the first line of the file
360                final CSVRecord nextRecord = this.nextRecord();
361                if (nextRecord != null) {
362                    header = nextRecord.values();
363                }
364            } else {
365                if (this.format.getSkipHeaderRecord()) {
366                    this.nextRecord();
367                }
368                header = formatHeader;
369            }
370
371            // build the name to index mappings
372            if (header != null) {
373                for (int i = 0; i < header.length; i++) {
374                    if (hdrMap.containsKey(header[i])) {
375                        throw new IllegalStateException("The header contains duplicate names: "
376                                + Arrays.toString(header));
377                    }
378                    hdrMap.put(header[i], Integer.valueOf(i));
379                }
380            }
381        }
382        return hdrMap;
383    }
384
385    public boolean isClosed() {
386        return this.lexer.isClosed();
387    }
388
389    /**
390     * Returns an iterator on the records.
391     *
392     * <p>IOExceptions occurring during the iteration are wrapped in a
393     * RuntimeException.
394     * If the parser is closed a call to {@code next()} will throw a
395     * NoSuchElementException.</p>
396     */
397    public Iterator<CSVRecord> iterator() {
398        return new Iterator<CSVRecord>() {
399            private CSVRecord current;
400
401            private CSVRecord getNextRecord() {
402                try {
403                    return CSVParser.this.nextRecord();
404                } catch (final IOException e) {
405                    // TODO: This is not great, throw an ISE instead?
406                    throw new RuntimeException(e);
407                }
408            }
409
410            public boolean hasNext() {
411                if (CSVParser.this.isClosed()) {
412                    return false;
413                }
414                if (this.current == null) {
415                    this.current = this.getNextRecord();
416                }
417
418                return this.current != null;
419            }
420
421            public CSVRecord next() {
422                if (CSVParser.this.isClosed()) {
423                    throw new NoSuchElementException("CSVParser has been closed");
424                }
425                CSVRecord next = this.current;
426                this.current = null;
427
428                if (next == null) {
429                    // hasNext() wasn't called before
430                    next = this.getNextRecord();
431                    if (next == null) {
432                        throw new NoSuchElementException("No more CSV records available");
433                    }
434                }
435
436                return next;
437            }
438
439            public void remove() {
440                throw new UnsupportedOperationException();
441            }
442        };
443    }
444
445    /**
446     * Parses the next record from the current point in the stream.
447     *
448     * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
449     * @throws IOException
450     *             on parse error or input read-failure
451     */
452    CSVRecord nextRecord() throws IOException {
453        CSVRecord result = null;
454        this.record.clear();
455        StringBuilder sb = null;
456        do {
457            this.reusableToken.reset();
458            this.lexer.nextToken(this.reusableToken);
459            switch (this.reusableToken.type) {
460            case TOKEN:
461                this.addRecordValue();
462                break;
463            case EORECORD:
464                this.addRecordValue();
465                break;
466            case EOF:
467                if (this.reusableToken.isReady) {
468                    this.addRecordValue();
469                }
470                break;
471            case INVALID:
472                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
473            case COMMENT: // Ignored currently
474                if (sb == null) { // first comment for this record
475                    sb = new StringBuilder();
476                } else {
477                    sb.append(Constants.LF);
478                }
479                sb.append(this.reusableToken.content);
480                this.reusableToken.type = TOKEN; // Read another token
481                break;
482            default:
483                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
484            }
485        } while (this.reusableToken.type == TOKEN);
486
487        if (!this.record.isEmpty()) {
488            this.recordNumber++;
489            final String comment = sb == null ? null : sb.toString();
490            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
491                    this.recordNumber);
492        }
493        return result;
494    }
495
496}