001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import java.io.Closeable;
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.URL;
028import java.nio.charset.Charset;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Iterator;
032import java.util.LinkedHashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.NoSuchElementException;
036import java.util.TreeMap;
037
038import static org.apache.commons.csv.Token.Type.*;
039
040/**
041 * Parses CSV files according to the specified format.
042 *
043 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
044 * specification of a {@link CSVFormat}.
045 *
046 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
047 *
048 * <h2>Creating instances</h2>
049 * <p>
050 * There are several static factory methods that can be used to create instances for various types of resources:
051 * </p>
052 * <ul>
053 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
054 *     <li>{@link #parse(String, CSVFormat)}</li>
055 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
056 * </ul>
057 * <p>
 * Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
059 *
060 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
061 * </p>
062 * <pre>
063 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
064 *     ...
065 * }
066 * </pre>
067 *
068 * <h2>Parsing record wise</h2>
069 * <p>
070 * To parse a CSV input from a file, you write:
071 * </p>
072 *
073 * <pre>
074 * File csvData = new File(&quot;/path/to/csv&quot;);
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
076 * for (CSVRecord csvRecord : parser) {
077 *     ...
078 * }
079 * </pre>
080 *
081 * <p>
 * This will read and parse the contents of the file using the
083 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
084 * </p>
085 *
086 * <p>
087 * To parse CSV input in a format like Excel, you write:
088 * </p>
089 *
090 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
092 * for (CSVRecord csvRecord : parser) {
093 *     ...
094 * }
095 * </pre>
096 *
097 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
099 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
100 * </p>
101 *
102 * <h2>Parsing into memory</h2>
103 * <p>
104 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
105 * </p>
106 *
107 * <pre>
108 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
109 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
110 * List&lt;CSVRecord&gt; list = parser.getRecords();
111 * </pre>
112 *
113 * <p>
114 * There are two constraints that have to be kept in mind:
115 * </p>
116 *
117 * <ol>
118 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
119 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
120 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
121 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
122 * </ol>
123 *
124 * <h2>Notes</h2>
125 * <p>
126 * Internal parser state is completely covered by the format and the reader-state.
127 * </p>
128 *
129 * @version $Id$
130 *
131 * @see <a href="package-summary.html">package documentation for more details</a>
132 */
133public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134
135    /**
136     * Creates a parser for the given {@link File}.
137     *
138     * <p><strong>Note:</strong> This method internally creates a FileReader using
139     * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
140     * is executing the code. If this is insufficient create a URL to the file and use
141     * {@link #parse(URL, Charset, CSVFormat)}</p>
142     *
143     * @param file
144     *            a CSV file. Must not be null.
145     * @param charset
146     *            A charset
147     * @param format
148     *            the CSVFormat used for CSV parsing. Must not be null.
149     * @return a new parser
150     * @throws IllegalArgumentException
151     *             If the parameters of the format are inconsistent or if either file or format are null.
152     * @throws IOException
153     *             If an I/O error occurs
154     */
155    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
156        Assertions.notNull(file, "file");
157        Assertions.notNull(format, "format");
158        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
159    }
160
161    /**
162     * Creates a parser for the given {@link String}.
163     *
164     * @param string
165     *            a CSV string. Must not be null.
166     * @param format
167     *            the CSVFormat used for CSV parsing. Must not be null.
168     * @return a new parser
169     * @throws IllegalArgumentException
170     *             If the parameters of the format are inconsistent or if either string or format are null.
171     * @throws IOException
172     *             If an I/O error occurs
173     */
174    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
175        Assertions.notNull(string, "string");
176        Assertions.notNull(format, "format");
177
178        return new CSVParser(new StringReader(string), format);
179    }
180
181    /**
182     * Creates a parser for the given URL.
183     *
184     * <p>
185     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
186     * you close the {@code url}.
187     * </p>
188     *
189     * @param url
190     *            a URL. Must not be null.
191     * @param charset
192     *            the charset for the resource. Must not be null.
193     * @param format
194     *            the CSVFormat used for CSV parsing. Must not be null.
195     * @return a new parser
196     * @throws IllegalArgumentException
197     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
198     * @throws IOException
199     *             If an I/O error occurs
200     */
201    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
202        Assertions.notNull(url, "url");
203        Assertions.notNull(charset, "charset");
204        Assertions.notNull(format, "format");
205
206        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
207    }
208
209    // the following objects are shared to reduce garbage
210
211    private final CSVFormat format;
212
213    /** A mapping of column names to column indices */
214    private final Map<String, Integer> headerMap;
215
216    private final Lexer lexer;
217
218    /** A record buffer for getRecord(). Grows as necessary and is reused. */
219    private final List<String> record = new ArrayList<>();
220
221    /**
222     * The next record number to assign.
223     */
224    private long recordNumber;
225
226    /**
227     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
228     * with {@link #recordNumber}.
229     */
230    private final long characterOffset;
231
232    private final Token reusableToken = new Token();
233
234    /**
235     * Customized CSV parser using the given {@link CSVFormat}
236     *
237     * <p>
238     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
239     * unless you close the {@code reader}.
240     * </p>
241     *
242     * @param reader
243     *            a Reader containing CSV-formatted input. Must not be null.
244     * @param format
245     *            the CSVFormat used for CSV parsing. Must not be null.
246     * @throws IllegalArgumentException
247     *             If the parameters of the format are inconsistent or if either reader or format are null.
248     * @throws IOException
249     *             If there is a problem reading the header or skipping the first record
250     */
251    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
252        this(reader, format, 0, 1);
253    }
254
255    /**
256     * Customized CSV parser using the given {@link CSVFormat}
257     *
258     * <p>
259     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
260     * unless you close the {@code reader}.
261     * </p>
262     *
263     * @param reader
264     *            a Reader containing CSV-formatted input. Must not be null.
265     * @param format
266     *            the CSVFormat used for CSV parsing. Must not be null.
267     * @param characterOffset
268     *            Lexer offset when the parser does not start parsing at the beginning of the source.
269     * @param recordNumber
270     *            The next record number to assign
271     * @throws IllegalArgumentException
272     *             If the parameters of the format are inconsistent or if either reader or format are null.
273     * @throws IOException
274     *             If there is a problem reading the header or skipping the first record
275     * @since 1.1
276     */
277    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
278            throws IOException {
279        Assertions.notNull(reader, "reader");
280        Assertions.notNull(format, "format");
281
282        this.format = format;
283        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
284        this.headerMap = this.initializeHeader();
285        this.characterOffset = characterOffset;
286        this.recordNumber = recordNumber - 1;
287    }
288
289    private void addRecordValue(final boolean lastRecord) {
290        final String input = this.reusableToken.content.toString();
291        final String inputClean = this.format.getTrim() ? input.trim() : input;
292        if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
293            return;
294        }
295        final String nullString = this.format.getNullString();
296        this.record.add(inputClean.equals(nullString) ? null : inputClean);
297    }
298
299    /**
300     * Closes resources.
301     *
302     * @throws IOException
303     *             If an I/O error occurs
304     */
305    @Override
306    public void close() throws IOException {
307        if (this.lexer != null) {
308            this.lexer.close();
309        }
310    }
311
312    /**
313     * Returns the current line number in the input stream.
314     *
315     * <p>
316     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
317     * the record number.
318     * </p>
319     *
320     * @return current line number
321     */
322    public long getCurrentLineNumber() {
323        return this.lexer.getCurrentLineNumber();
324    }
325
326    /**
327     * Returns a copy of the header map that iterates in column order.
328     * <p>
329     * The map keys are column names. The map values are 0-based indices.
330     * </p>
331     * @return a copy of the header map that iterates in column order.
332     */
333    public Map<String, Integer> getHeaderMap() {
334        return this.headerMap == null ? null : new LinkedHashMap<>(this.headerMap);
335    }
336
337    /**
338     * Returns the current record number in the input stream.
339     *
340     * <p>
341     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
342     * the line number.
343     * </p>
344     *
345     * @return current record number
346     */
347    public long getRecordNumber() {
348        return this.recordNumber;
349    }
350
351    /**
352     * Parses the CSV input according to the given format and returns the content as a list of
353     * {@link CSVRecord CSVRecords}.
354     *
355     * <p>
356     * The returned content starts at the current parse-position in the stream.
357     * </p>
358     *
359     * @return list of {@link CSVRecord CSVRecords}, may be empty
360     * @throws IOException
361     *             on parse error or input read-failure
362     */
363    public List<CSVRecord> getRecords() throws IOException {
364        CSVRecord rec;
365        final List<CSVRecord> records = new ArrayList<>();
366        while ((rec = this.nextRecord()) != null) {
367            records.add(rec);
368        }
369        return records;
370    }
371
372    /**
373     * Initializes the name to index mapping if the format defines a header.
374     *
375     * @return null if the format has no header.
376     * @throws IOException if there is a problem reading the header or skipping the first record
377     */
378    private Map<String, Integer> initializeHeader() throws IOException {
379        Map<String, Integer> hdrMap = null;
380        final String[] formatHeader = this.format.getHeader();
381        if (formatHeader != null) {
382            hdrMap = this.format.getIgnoreHeaderCase() ?
383                    new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER) :
384                    new LinkedHashMap<String, Integer>();
385
386            String[] headerRecord = null;
387            if (formatHeader.length == 0) {
388                // read the header from the first line of the file
389                final CSVRecord nextRecord = this.nextRecord();
390                if (nextRecord != null) {
391                    headerRecord = nextRecord.values();
392                }
393            } else {
394                if (this.format.getSkipHeaderRecord()) {
395                    this.nextRecord();
396                }
397                headerRecord = formatHeader;
398            }
399
400            // build the name to index mappings
401            if (headerRecord != null) {
402                for (int i = 0; i < headerRecord.length; i++) {
403                    final String header = headerRecord[i];
404                    final boolean containsHeader = hdrMap.containsKey(header);
405                    final boolean emptyHeader = header == null || header.trim().isEmpty();
406                    if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
407                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
408                                "\" in " + Arrays.toString(headerRecord));
409                    }
410                    hdrMap.put(header, Integer.valueOf(i));
411                }
412            }
413        }
414        return hdrMap;
415    }
416
417    /**
418     * Gets whether this parser is closed.
419     *
420     * @return whether this parser is closed.
421     */
422    public boolean isClosed() {
423        return this.lexer.isClosed();
424    }
425
426    /**
427     * Returns an iterator on the records.
428     *
429     * <p>IOExceptions occurring during the iteration are wrapped in a
430     * RuntimeException.
431     * If the parser is closed a call to {@code next()} will throw a
432     * NoSuchElementException.</p>
433     */
434    @Override
435    public Iterator<CSVRecord> iterator() {
436        return new Iterator<CSVRecord>() {
437            private CSVRecord current;
438
439            private CSVRecord getNextRecord() {
440                try {
441                    return CSVParser.this.nextRecord();
442                } catch (final IOException e) {
443                    // TODO: This is not great, throw an ISE instead?
444                    throw new RuntimeException(e);
445                }
446            }
447
448            @Override
449            public boolean hasNext() {
450                if (CSVParser.this.isClosed()) {
451                    return false;
452                }
453                if (this.current == null) {
454                    this.current = this.getNextRecord();
455                }
456
457                return this.current != null;
458            }
459
460            @Override
461            public CSVRecord next() {
462                if (CSVParser.this.isClosed()) {
463                    throw new NoSuchElementException("CSVParser has been closed");
464                }
465                CSVRecord next = this.current;
466                this.current = null;
467
468                if (next == null) {
469                    // hasNext() wasn't called before
470                    next = this.getNextRecord();
471                    if (next == null) {
472                        throw new NoSuchElementException("No more CSV records available");
473                    }
474                }
475
476                return next;
477            }
478
479            @Override
480            public void remove() {
481                throw new UnsupportedOperationException();
482            }
483        };
484    }
485
486    /**
487     * Parses the next record from the current point in the stream.
488     *
489     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
490     * @throws IOException
491     *             on parse error or input read-failure
492     */
493    CSVRecord nextRecord() throws IOException {
494        CSVRecord result = null;
495        this.record.clear();
496        StringBuilder sb = null;
497        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
498        do {
499            this.reusableToken.reset();
500            this.lexer.nextToken(this.reusableToken);
501            switch (this.reusableToken.type) {
502            case TOKEN:
503                this.addRecordValue(false);
504                break;
505            case EORECORD:
506                this.addRecordValue(true);
507                break;
508            case EOF:
509                if (this.reusableToken.isReady) {
510                    this.addRecordValue(true);
511                }
512                break;
513            case INVALID:
514                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
515            case COMMENT: // Ignored currently
516                if (sb == null) { // first comment for this record
517                    sb = new StringBuilder();
518                } else {
519                    sb.append(Constants.LF);
520                }
521                sb.append(this.reusableToken.content);
522                this.reusableToken.type = TOKEN; // Read another token
523                break;
524            default:
525                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
526            }
527        } while (this.reusableToken.type == TOKEN);
528
529        if (!this.record.isEmpty()) {
530            this.recordNumber++;
531            final String comment = sb == null ? null : sb.toString();
532            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
533                    this.recordNumber, startCharPosition);
534        }
535        return result;
536    }
537
538}