001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import static org.apache.commons.csv.Token.Type.TOKEN;
021
022import java.io.Closeable;
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.Reader;
029import java.io.StringReader;
030import java.net.URL;
031import java.nio.charset.Charset;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.util.ArrayList;
035import java.util.Arrays;
036import java.util.Iterator;
037import java.util.LinkedHashMap;
038import java.util.List;
039import java.util.Map;
040import java.util.NoSuchElementException;
041import java.util.TreeMap;
042
043/**
044 * Parses CSV files according to the specified format.
045 *
046 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
047 * specification of a {@link CSVFormat}.
048 *
049 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
050 *
051 * <h2>Creating instances</h2>
052 * <p>
053 * There are several static factory methods that can be used to create instances for various types of resources:
054 * </p>
055 * <ul>
056 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
057 *     <li>{@link #parse(String, CSVFormat)}</li>
058 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
059 * </ul>
060 * <p>
061 * Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
062 *
063 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
064 * </p>
065 * <pre>
066 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
067 *     ...
068 * }
069 * </pre>
070 *
071 * <h2>Parsing record wise</h2>
072 * <p>
073 * To parse a CSV input from a file, you write:
074 * </p>
075 *
076 * <pre>
077 * File csvData = new File(&quot;/path/to/csv&quot;);
078 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
079 * for (CSVRecord csvRecord : parser) {
080 *     ...
081 * }
082 * </pre>
083 *
084 * <p>
085 * This will read and parse the contents of the file using the
086 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
087 * </p>
088 *
089 * <p>
090 * To parse CSV input in a format like Excel, you write:
091 * </p>
092 *
093 * <pre>
094 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
095 * for (CSVRecord csvRecord : parser) {
096 *     ...
097 * }
098 * </pre>
099 *
100 * <p>
101 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
102 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
103 * </p>
104 *
105 * <h2>Parsing into memory</h2>
106 * <p>
107 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
108 * </p>
109 *
110 * <pre>
111 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
112 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
113 * List&lt;CSVRecord&gt; list = parser.getRecords();
114 * </pre>
115 *
116 * <p>
117 * There are two constraints that have to be kept in mind:
118 * </p>
119 *
120 * <ol>
121 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
122 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
123 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
124 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
125 * </ol>
126 *
127 * <h2>Notes</h2>
128 * <p>
129 * Internal parser state is completely covered by the format and the reader-state.
130 * </p>
131 *
132 * @see <a href="package-summary.html">package documentation for more details</a>
133 */
134public final class CSVParser implements Iterable<CSVRecord>, Closeable {
135
136    /**
137     * Creates a parser for the given {@link File}.
138     *
139     * @param file
140     *            a CSV file. Must not be null.
141     * @param charset
142     *            A Charset
143     * @param format
144     *            the CSVFormat used for CSV parsing. Must not be null.
145     * @return a new parser
146     * @throws IllegalArgumentException
147     *             If the parameters of the format are inconsistent or if either file or format are null.
148     * @throws IOException
149     *             If an I/O error occurs
150     */
151    @SuppressWarnings("resource")
152    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
153        Assertions.notNull(file, "file");
154        Assertions.notNull(format, "format");
155        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
156    }
157
158    /**
159     * Creates a CSV parser using the given {@link CSVFormat}.
160     *
161     * <p>
162     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
163     * unless you close the {@code reader}.
164     * </p>
165     *
166     * @param inputStream
167     *            an InputStream containing CSV-formatted input. Must not be null.
168     * @param charset
169     *            a Charset.
170     * @param format
171     *            the CSVFormat used for CSV parsing. Must not be null.
172     * @return a new CSVParser configured with the given reader and format.
173     * @throws IllegalArgumentException
174     *             If the parameters of the format are inconsistent or if either reader or format are null.
175     * @throws IOException
176     *             If there is a problem reading the header or skipping the first record
177     * @since 1.5
178     */
179    @SuppressWarnings("resource")
180    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
181            throws IOException {
182        Assertions.notNull(inputStream, "inputStream");
183        Assertions.notNull(format, "format");
184        return parse(new InputStreamReader(inputStream, charset), format);
185    }
186
187    /**
188     * Creates a parser for the given {@link Path}.
189     *
190     * @param path
191     *            a CSV file. Must not be null.
192     * @param charset
193     *            A Charset
194     * @param format
195     *            the CSVFormat used for CSV parsing. Must not be null.
196     * @return a new parser
197     * @throws IllegalArgumentException
198     *             If the parameters of the format are inconsistent or if either file or format are null.
199     * @throws IOException
200     *             If an I/O error occurs
201     * @since 1.5
202     */
203    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
204        Assertions.notNull(path, "path");
205        Assertions.notNull(format, "format");
206        return parse(Files.newBufferedReader(path, charset), format);
207    }
208
209    /**
210     * Creates a CSV parser using the given {@link CSVFormat}
211     *
212     * <p>
213     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
214     * unless you close the {@code reader}.
215     * </p>
216     *
217     * @param reader
218     *            a Reader containing CSV-formatted input. Must not be null.
219     * @param format
220     *            the CSVFormat used for CSV parsing. Must not be null.
221     * @return a new CSVParser configured with the given reader and format.
222     * @throws IllegalArgumentException
223     *             If the parameters of the format are inconsistent or if either reader or format are null.
224     * @throws IOException
225     *             If there is a problem reading the header or skipping the first record
226     * @since 1.5
227     */
228    public static CSVParser parse(Reader reader, final CSVFormat format) throws IOException {
229        return new CSVParser(reader, format);
230    }
231
232    /**
233     * Creates a parser for the given {@link String}.
234     *
235     * @param string
236     *            a CSV string. Must not be null.
237     * @param format
238     *            the CSVFormat used for CSV parsing. Must not be null.
239     * @return a new parser
240     * @throws IllegalArgumentException
241     *             If the parameters of the format are inconsistent or if either string or format are null.
242     * @throws IOException
243     *             If an I/O error occurs
244     */
245    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
246        Assertions.notNull(string, "string");
247        Assertions.notNull(format, "format");
248
249        return new CSVParser(new StringReader(string), format);
250    }
251
252    /**
253     * Creates a parser for the given URL.
254     *
255     * <p>
256     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
257     * you close the {@code url}.
258     * </p>
259     *
260     * @param url
261     *            a URL. Must not be null.
262     * @param charset
263     *            the charset for the resource. Must not be null.
264     * @param format
265     *            the CSVFormat used for CSV parsing. Must not be null.
266     * @return a new parser
267     * @throws IllegalArgumentException
268     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
269     * @throws IOException
270     *             If an I/O error occurs
271     */
272    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
273        Assertions.notNull(url, "url");
274        Assertions.notNull(charset, "charset");
275        Assertions.notNull(format, "format");
276
277        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
278    }
279
280    // the following objects are shared to reduce garbage
281
282    private final CSVFormat format;
283
284    /** A mapping of column names to column indices */
285    private final Map<String, Integer> headerMap;
286
287    private final Lexer lexer;
288
289    /** A record buffer for getRecord(). Grows as necessary and is reused. */
290    private final List<String> recordList = new ArrayList<>();
291
292    /**
293     * The next record number to assign.
294     */
295    private long recordNumber;
296
297    /**
298     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
299     * with {@link #recordNumber}.
300     */
301    private final long characterOffset;
302
303    private final Token reusableToken = new Token();
304
305    /**
306     * Customized CSV parser using the given {@link CSVFormat}
307     *
308     * <p>
309     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
310     * unless you close the {@code reader}.
311     * </p>
312     *
313     * @param reader
314     *            a Reader containing CSV-formatted input. Must not be null.
315     * @param format
316     *            the CSVFormat used for CSV parsing. Must not be null.
317     * @throws IllegalArgumentException
318     *             If the parameters of the format are inconsistent or if either reader or format are null.
319     * @throws IOException
320     *             If there is a problem reading the header or skipping the first record
321     */
322    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
323        this(reader, format, 0, 1);
324    }
325
326    /**
327     * Customized CSV parser using the given {@link CSVFormat}
328     *
329     * <p>
330     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
331     * unless you close the {@code reader}.
332     * </p>
333     *
334     * @param reader
335     *            a Reader containing CSV-formatted input. Must not be null.
336     * @param format
337     *            the CSVFormat used for CSV parsing. Must not be null.
338     * @param characterOffset
339     *            Lexer offset when the parser does not start parsing at the beginning of the source.
340     * @param recordNumber
341     *            The next record number to assign
342     * @throws IllegalArgumentException
343     *             If the parameters of the format are inconsistent or if either reader or format are null.
344     * @throws IOException
345     *             If there is a problem reading the header or skipping the first record
346     * @since 1.1
347     */
348    @SuppressWarnings("resource")
349    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
350            throws IOException {
351        Assertions.notNull(reader, "reader");
352        Assertions.notNull(format, "format");
353
354        this.format = format;
355        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
356        this.headerMap = this.initializeHeader();
357        this.characterOffset = characterOffset;
358        this.recordNumber = recordNumber - 1;
359    }
360
361    private void addRecordValue(final boolean lastRecord) {
362        final String input = this.reusableToken.content.toString();
363        final String inputClean = this.format.getTrim() ? input.trim() : input;
364        if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
365            return;
366        }
367        final String nullString = this.format.getNullString();
368        this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
369    }
370
371    /**
372     * Closes resources.
373     *
374     * @throws IOException
375     *             If an I/O error occurs
376     */
377    @Override
378    public void close() throws IOException {
379        if (this.lexer != null) {
380            this.lexer.close();
381        }
382    }
383
384    /**
385     * Returns the current line number in the input stream.
386     *
387     * <p>
388     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
389     * the record number.
390     * </p>
391     *
392     * @return current line number
393     */
394    public long getCurrentLineNumber() {
395        return this.lexer.getCurrentLineNumber();
396    }
397
398    /**
399     * Gets the first end-of-line string encountered.
400     *
401     * @return the first end-of-line string
402     * @since 1.5
403     */
404    public String getFirstEndOfLine() {
405        return lexer.getFirstEol();
406    }
407
408    /**
409     * Returns a copy of the header map that iterates in column order.
410     * <p>
411     * The map keys are column names. The map values are 0-based indices.
412     * </p>
413     * @return a copy of the header map that iterates in column order.
414     */
415    public Map<String, Integer> getHeaderMap() {
416        return this.headerMap == null ? null : new LinkedHashMap<>(this.headerMap);
417    }
418
419    /**
420     * Returns the current record number in the input stream.
421     *
422     * <p>
423     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
424     * the line number.
425     * </p>
426     *
427     * @return current record number
428     */
429    public long getRecordNumber() {
430        return this.recordNumber;
431    }
432
433    /**
434     * Parses the CSV input according to the given format and returns the content as a list of
435     * {@link CSVRecord CSVRecords}.
436     *
437     * <p>
438     * The returned content starts at the current parse-position in the stream.
439     * </p>
440     *
441     * @return list of {@link CSVRecord CSVRecords}, may be empty
442     * @throws IOException
443     *             on parse error or input read-failure
444     */
445    public List<CSVRecord> getRecords() throws IOException {
446        CSVRecord rec;
447        final List<CSVRecord> records = new ArrayList<>();
448        while ((rec = this.nextRecord()) != null) {
449            records.add(rec);
450        }
451        return records;
452    }
453
454    /**
455     * Initializes the name to index mapping if the format defines a header.
456     *
457     * @return null if the format has no header.
458     * @throws IOException if there is a problem reading the header or skipping the first record
459     */
460    private Map<String, Integer> initializeHeader() throws IOException {
461        Map<String, Integer> hdrMap = null;
462        final String[] formatHeader = this.format.getHeader();
463        if (formatHeader != null) {
464            hdrMap = this.format.getIgnoreHeaderCase() ?
465                    new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER) :
466                    new LinkedHashMap<String, Integer>();
467
468            String[] headerRecord = null;
469            if (formatHeader.length == 0) {
470                // read the header from the first line of the file
471                final CSVRecord nextRecord = this.nextRecord();
472                if (nextRecord != null) {
473                    headerRecord = nextRecord.values();
474                }
475            } else {
476                if (this.format.getSkipHeaderRecord()) {
477                    this.nextRecord();
478                }
479                headerRecord = formatHeader;
480            }
481
482            // build the name to index mappings
483            if (headerRecord != null) {
484                for (int i = 0; i < headerRecord.length; i++) {
485                    final String header = headerRecord[i];
486                    final boolean containsHeader = hdrMap.containsKey(header);
487                    final boolean emptyHeader = header == null || header.trim().isEmpty();
488                    if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
489                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
490                                "\" in " + Arrays.toString(headerRecord));
491                    }
492                    hdrMap.put(header, Integer.valueOf(i));
493                }
494            }
495        }
496        return hdrMap;
497    }
498
499    /**
500     * Gets whether this parser is closed.
501     *
502     * @return whether this parser is closed.
503     */
504    public boolean isClosed() {
505        return this.lexer.isClosed();
506    }
507
508    /**
509     * Returns an iterator on the records.
510     *
511     * <p>
512     * An {@link IOException} caught during the iteration are re-thrown as an
513     * {@link IllegalStateException}.
514     * </p>
515     * <p>
516     * If the parser is closed a call to {@link Iterator#next()} will throw a
517     * {@link NoSuchElementException}.
518     * </p>
519     */
520    @Override
521    public Iterator<CSVRecord> iterator() {
522        return new Iterator<CSVRecord>() {
523            private CSVRecord current;
524
525            private CSVRecord getNextRecord() {
526                try {
527                    return CSVParser.this.nextRecord();
528                } catch (final IOException e) {
529                    throw new IllegalStateException(
530                            e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
531                }
532            }
533
534            @Override
535            public boolean hasNext() {
536                if (CSVParser.this.isClosed()) {
537                    return false;
538                }
539                if (this.current == null) {
540                    this.current = this.getNextRecord();
541                }
542
543                return this.current != null;
544            }
545
546            @Override
547            public CSVRecord next() {
548                if (CSVParser.this.isClosed()) {
549                    throw new NoSuchElementException("CSVParser has been closed");
550                }
551                CSVRecord next = this.current;
552                this.current = null;
553
554                if (next == null) {
555                    // hasNext() wasn't called before
556                    next = this.getNextRecord();
557                    if (next == null) {
558                        throw new NoSuchElementException("No more CSV records available");
559                    }
560                }
561
562                return next;
563            }
564
565            @Override
566            public void remove() {
567                throw new UnsupportedOperationException();
568            }
569        };
570    }
571
572    /**
573     * Parses the next record from the current point in the stream.
574     *
575     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
576     * @throws IOException
577     *             on parse error or input read-failure
578     */
579    CSVRecord nextRecord() throws IOException {
580        CSVRecord result = null;
581        this.recordList.clear();
582        StringBuilder sb = null;
583        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
584        do {
585            this.reusableToken.reset();
586            this.lexer.nextToken(this.reusableToken);
587            switch (this.reusableToken.type) {
588            case TOKEN:
589                this.addRecordValue(false);
590                break;
591            case EORECORD:
592                this.addRecordValue(true);
593                break;
594            case EOF:
595                if (this.reusableToken.isReady) {
596                    this.addRecordValue(true);
597                }
598                break;
599            case INVALID:
600                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
601            case COMMENT: // Ignored currently
602                if (sb == null) { // first comment for this record
603                    sb = new StringBuilder();
604                } else {
605                    sb.append(Constants.LF);
606                }
607                sb.append(this.reusableToken.content);
608                this.reusableToken.type = TOKEN; // Read another token
609                break;
610            default:
611                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
612            }
613        } while (this.reusableToken.type == TOKEN);
614
615        if (!this.recordList.isEmpty()) {
616            this.recordNumber++;
617            final String comment = sb == null ? null : sb.toString();
618            result = new CSVRecord(this.recordList.toArray(new String[this.recordList.size()]), this.headerMap, comment,
619                    this.recordNumber, startCharPosition);
620        }
621        return result;
622    }
623
624}