001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019
020package org.apache.commons.csv;
021
022import static org.apache.commons.csv.Token.Type.TOKEN;
023
024import java.io.Closeable;
025import java.io.File;
026import java.io.IOException;
027import java.io.InputStream;
028import java.io.InputStreamReader;
029import java.io.Reader;
030import java.io.StringReader;
031import java.io.UncheckedIOException;
032import java.net.URL;
033import java.nio.charset.Charset;
034import java.nio.file.Files;
035import java.nio.file.Path;
036import java.util.ArrayList;
037import java.util.Arrays;
038import java.util.Collections;
039import java.util.Iterator;
040import java.util.LinkedHashMap;
041import java.util.List;
042import java.util.Map;
043import java.util.NoSuchElementException;
044import java.util.Objects;
045import java.util.Spliterator;
046import java.util.Spliterators;
047import java.util.TreeMap;
048import java.util.stream.Collectors;
049import java.util.stream.Stream;
050import java.util.stream.StreamSupport;
051
052import org.apache.commons.io.Charsets;
053import org.apache.commons.io.build.AbstractStreamBuilder;
054import org.apache.commons.io.function.Uncheck;
055
056/**
057 * Parses CSV files according to the specified format.
058 *
059 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
060 * specification of a {@link CSVFormat}.
061 *
062 * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
063 *
064 * <h2>Creating instances</h2>
065 * <p>
066 * There are several static factory methods that can be used to create instances for various types of resources:
067 * </p>
068 * <ul>
069 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
070 *     <li>{@link #parse(String, CSVFormat)}</li>
071 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
072 * </ul>
073 * <p>
 * Alternatively, parsers can also be created from a {@link Reader} with {@link #parse(Reader, CSVFormat)} or with a {@link #builder() builder}.
075 *
076 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
077 * </p>
078 * <pre>
079 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
080 *     ...
081 * }
082 * </pre>
083 *
084 * <h2>Parsing record wise</h2>
085 * <p>
086 * To parse a CSV input from a file, you write:
087 * </p>
088 *
089 * <pre>{@code
090 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
092 * for (CSVRecord csvRecord : parser) {
093 *     ...
094 * }}
095 * </pre>
096 *
097 * <p>
 * This will read and parse the contents of the file using the
099 * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
100 * </p>
101 *
102 * <p>
103 * To parse CSV input in a format like Excel, you write:
104 * </p>
105 *
106 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
108 * for (CSVRecord csvRecord : parser) {
109 *     ...
110 * }
111 * </pre>
112 *
113 * <p>
114 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
115 * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
116 * </p>
117 *
118 * <h2>Parsing into memory</h2>
119 * <p>
120 * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
121 * </p>
122 *
123 * <pre>{@code
124 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = CSVParser.parse(in, CSVFormat.EXCEL);
126 * List<CSVRecord> list = parser.getRecords();
127 * }</pre>
128 *
129 * <p>
130 * There are two constraints that have to be kept in mind:
131 * </p>
132 *
133 * <ol>
134 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
135 *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
136 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
137 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
138 * </ol>
139 *
140 * <h2>Notes</h2>
141 * <p>
142 * The internal parser state is completely covered by the format and the reader state.
143 * </p>
144 *
145 * @see <a href="package-summary.html">package documentation for more details</a>
146 */
147public final class CSVParser implements Iterable<CSVRecord>, Closeable {
148
149    /**
150     * Builds a new {@link CSVParser}.
151     *
152     * @since 1.13.0
153     */
154    public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
155
156        private CSVFormat format;
157        private long characterOffset;
158        private long recordNumber = 1;
159        private boolean trackBytes;
160
161        /**
162         * Constructs a new instance.
163         */
164        protected Builder() {
165            // empty
166        }
167
168        @SuppressWarnings("resource")
169        @Override
170        public CSVParser get() throws IOException {
171            return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
172        }
173
174        /**
175         * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
176         *
177         * @param characterOffset the lexer offset.
178         * @return this instance.
179         */
180        public Builder setCharacterOffset(final long characterOffset) {
181            this.characterOffset = characterOffset;
182            return asThis();
183        }
184
185        /**
186         * Sets the CSV format. A copy of the given format is kept.
187         *
188         * @param format the CSV format, {@code null} resets to {@link CSVFormat#DEFAULT}.
189         * @return this instance.
190         */
191        public Builder setFormat(final CSVFormat format) {
192            this.format = CSVFormat.copy(format);
193            return asThis();
194        }
195
196        /**
197         * Sets the next record number to assign, defaults to {@code 1}.
198         *
199         * @param recordNumber the next record number to assign.
200         * @return this instance.
201         */
202        public Builder setRecordNumber(final long recordNumber) {
203            this.recordNumber = recordNumber;
204            return asThis();
205        }
206
207        /**
208         * Sets whether to enable byte tracking for the parser.
209         *
210         * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
211         * @return this instance.
212         * @since 1.13.0
213         */
214        public Builder setTrackBytes(final boolean trackBytes) {
215            this.trackBytes = trackBytes;
216            return asThis();
217        }
218
219    }
220
221    final class CSVRecordIterator implements Iterator<CSVRecord> {
222        private CSVRecord current;
223
224        /**
225         * Gets the next record or null at the end of stream or max rows read.
226         *
227         * @throws IOException  on parse error or input read-failure
228         * @throws CSVException on invalid input.
229         * @return the next record, or {@code null} if the end of the stream has been reached.
230         */
231        private CSVRecord getNextRecord() {
232            CSVRecord record = null;
233            if (format.useRow(recordNumber + 1)) {
234                record = Uncheck.get(CSVParser.this::nextRecord);
235            }
236            return record;
237        }
238
239        @Override
240        public boolean hasNext() {
241            if (isClosed()) {
242                return false;
243            }
244            if (current == null) {
245                current = getNextRecord();
246            }
247            return current != null;
248        }
249
250        @Override
251        public CSVRecord next() {
252            if (isClosed()) {
253                throw new NoSuchElementException("CSVParser has been closed");
254            }
255            CSVRecord next = current;
256            current = null;
257            if (next == null) {
258                // hasNext() wasn't called before
259                next = getNextRecord();
260                if (next == null) {
261                    throw new NoSuchElementException("No more CSV records available");
262                }
263            }
264            return next;
265        }
266
267        @Override
268        public void remove() {
269            throw new UnsupportedOperationException();
270        }
271    }
272    /**
273     * Header information based on name and position.
274     */
275    private static final class Headers {
276
277        /**
278         * Header column positions (0-based)
279         */
280        final Map<String, Integer> headerMap;
281
282        /**
283         * Header names in column order
284         */
285        final List<String> headerNames;
286
287        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
288            this.headerMap = headerMap;
289            this.headerNames = headerNames;
290        }
291    }
292
293    /**
294     * Creates a new builder.
295     *
296     * @return a new builder.
297     * @since 1.13.0
298     */
299    public static Builder builder() {
300        return new Builder();
301    }
302
303    /**
304     * Creates a parser for the given {@link File}.
305     *
306     * @param file
307     *            a CSV file. Must not be null.
308     * @param charset
309     *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
310     * @param format
311     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
312     * @return a new parser
313     * @throws IllegalArgumentException
314     *             If the parameters of the format are inconsistent.
315     * @throws IOException
316     *             If an I/O error occurs
317     * @throws CSVException Thrown on invalid CSV input data.
318     * @throws NullPointerException if {@code file} is {@code null}.
319     */
320    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
321        Objects.requireNonNull(file, "file");
322        return parse(file.toPath(), charset, format);
323    }
324
325    /**
326     * Creates a CSV parser using the given {@link CSVFormat}.
327     *
328     * <p>
329     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
330     * unless you close the {@code reader}.
331     * </p>
332     *
333     * @param inputStream
334     *            an InputStream containing CSV-formatted input, {@code null} maps to {@link CSVFormat#DEFAULT}.
335     * @param charset
336     *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
337     * @param format
338     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
339     * @return a new CSVParser configured with the given reader and format.
340     * @throws IllegalArgumentException
341     *             If the parameters of the format are inconsistent or if either reader or format are null.
342     * @throws IOException
343     *             If there is a problem reading the header or skipping the first record
344     * @throws CSVException Thrown on invalid CSV input data.
345     * @since 1.5
346     */
347    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
348            throws IOException {
349        return parse(new InputStreamReader(inputStream, Charsets.toCharset(charset)), format);
350    }
351
352    /**
353     * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
354     *
355     * @param path
356     *            a CSV file. Must not be null.
357     * @param charset
358     *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
359     * @param format
360     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
361     * @return a new parser
362     * @throws IllegalArgumentException
363     *             If the parameters of the format are inconsistent.
364     * @throws IOException
365     *             If an I/O error occurs
366     * @throws CSVException Thrown on invalid CSV input data.
367     * @throws NullPointerException if {@code path} is {@code null}.
368     * @since 1.5
369     */
370    @SuppressWarnings("resource")
371    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
372        Objects.requireNonNull(path, "path");
373        return parse(Files.newInputStream(path), charset, format);
374    }
375
376    /**
377     * Creates a CSV parser using the given {@link CSVFormat}
378     *
379     * <p>
380     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
381     * unless you close the {@code reader}.
382     * </p>
383     *
384     * @param reader
385     *            a Reader containing CSV-formatted input. Must not be null.
386     * @param format
387     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
388     * @return a new CSVParser configured with the given reader and format.
389     * @throws IllegalArgumentException
390     *             If the parameters of the format are inconsistent or if either reader or format are null.
391     * @throws IOException
392     *             If there is a problem reading the header or skipping the first record
393     * @throws CSVException Thrown on invalid CSV input data.
394     * @since 1.5
395     */
396    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
397        return builder().setReader(reader).setFormat(format).get();
398    }
399
400    /**
401     * Creates a parser for the given {@link String}.
402     *
403     * @param string
404     *            a CSV string. Must not be null.
405     * @param format
406     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
407     * @return a new parser
408     * @throws IllegalArgumentException
409     *             If the parameters of the format are inconsistent.
410     * @throws IOException
411     *             If an I/O error occurs
412     * @throws CSVException Thrown on invalid CSV input data.
413     * @throws NullPointerException if {@code string} is {@code null}.
414     */
415    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
416        Objects.requireNonNull(string, "string");
417        return parse(new StringReader(string), format);
418    }
419
420    /**
421     * Creates and returns a parser for the given URL, which the caller MUST close.
422     *
423     * <p>
424     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
425     * you close the {@code url}.
426     * </p>
427     *
428     * @param url
429     *            a URL. Must not be null.
430     * @param charset
431     *            the charset for the resource, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
432     * @param format
433     *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
434     * @return a new parser
435     * @throws IllegalArgumentException
436     *             If the parameters of the format are inconsistent.
437     * @throws IOException
438     *             If an I/O error occurs
439     * @throws CSVException Thrown on invalid CSV input data.
440     * @throws NullPointerException if {@code url} is {@code null}.
441     */
442    @SuppressWarnings("resource")
443    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
444        Objects.requireNonNull(url, "url");
445        return parse(url.openStream(), charset, format);
446    }
447
448    private String headerComment;
449
450    private String trailerComment;
451
452    private final CSVFormat format;
453
454    private final Headers headers;
455
456    private final Lexer lexer;
457
458    private final CSVRecordIterator csvRecordIterator;
459
460    /** A record buffer for getRecord(). Grows as necessary and is reused. */
461    private final List<String> recordList = new ArrayList<>();
462
463    /**
464     * The next record number to assign.
465     */
466    private long recordNumber;
467
468    /**
469     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
470     * with {@link #recordNumber}.
471     */
472    private final long characterOffset;
473
474    private final Token reusableToken = new Token();
475
476    /**
477     * Constructs a new instance using the given {@link CSVFormat}.
478     *
479     * <p>
480     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
481     * unless you close the {@code reader}.
482     * </p>
483     *
484     * @param reader
485     *            a Reader containing CSV-formatted input. Must not be null.
486     * @param format
487     *            the CSVFormat used for CSV parsing. Must not be null.
488     * @throws IllegalArgumentException
489     *             If the parameters of the format are inconsistent or if either reader or format are null.
490     * @throws IOException
491     *             If there is a problem reading the header or skipping the first record
492     * @throws CSVException Thrown on invalid CSV input data.
493     * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
494     */
495    @Deprecated
496    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
497        this(reader, format, 0, 1);
498    }
499
500    /**
501     * Constructs a new instance using the given {@link CSVFormat}.
502     *
503     * <p>
504     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
505     * unless you close the {@code reader}.
506     * </p>
507     *
508     * @param reader
509     *            a Reader containing CSV-formatted input. Must not be null.
510     * @param format
511     *            the CSVFormat used for CSV parsing. Must not be null.
512     * @param characterOffset
513     *            Lexer offset when the parser does not start parsing at the beginning of the source.
514     * @param recordNumber
515     *            The next record number to assign.
516     * @throws IllegalArgumentException
517     *             If the parameters of the format are inconsistent or if either the reader or format is null.
518     * @throws IOException
519     *             if there is a problem reading the header or skipping the first record
520     * @throws CSVException on invalid input.
521     * @since 1.1
522     * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
523     */
524    @Deprecated
525    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException {
526        this(reader, format, characterOffset, recordNumber, null, false);
527    }
528
529    /**
530     * Constructs a new instance using the given {@link CSVFormat}.
531     *
532     * <p>
533     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
534     * unless you close the {@code reader}.
535     * </p>
536     *
537     * @param reader
538     *            a Reader containing CSV-formatted input. Must not be null.
539     * @param format
540     *            the CSVFormat used for CSV parsing. Must not be null.
541     * @param characterOffset
542     *            Lexer offset when the parser does not start parsing at the beginning of the source.
543     * @param recordNumber
544     *            The next record number to assign.
545     * @param charset
546     *            The character encoding to be used for the reader when enableByteTracking is true.
547     * @param trackBytes
548     *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
549     * @throws IllegalArgumentException
550     *             If the parameters of the format are inconsistent or if either the reader or format is null.
551     * @throws IOException
552     *             If there is a problem reading the header or skipping the first record.
553     * @throws CSVException Thrown on invalid CSV input data.
554     */
555    private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
556        final Charset charset, final boolean trackBytes)
557        throws IOException {
558        Objects.requireNonNull(reader, "reader");
559        Objects.requireNonNull(format, "format");
560        this.format = format.copy();
561        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
562        this.csvRecordIterator = new CSVRecordIterator();
563        this.headers = createHeaders();
564        this.characterOffset = characterOffset;
565        this.recordNumber = recordNumber - 1;
566    }
567
568    private void addRecordValue(final boolean lastRecord) {
569        final String input = format.trim(reusableToken.content.toString());
570        if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
571            return;
572        }
573        recordList.add(handleNull(input));
574    }
575
576    /**
577     * Closes resources.
578     *
579     * @throws IOException
580     *             If an I/O error occurs
581     */
582    @Override
583    public void close() throws IOException {
584        lexer.close();
585    }
586
587    private Map<String, Integer> createEmptyHeaderMap() {
588        return format.getIgnoreHeaderCase() ?
589                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
590                new LinkedHashMap<>();
591    }
592
593    /**
594     * Creates the name to index mapping if the format defines a header.
595     *
596     * @return null if the format has no header.
597     * @throws IOException if there is a problem reading the header or skipping the first record
598     * @throws CSVException on invalid input.
599     */
600    private Headers createHeaders() throws IOException {
601        Map<String, Integer> headerMap = null;
602        List<String> headerNames = null;
603        final String[] formatHeader = format.getHeader();
604        if (formatHeader != null) {
605            headerMap = createEmptyHeaderMap();
606            String[] headerRecord = null;
607            if (formatHeader.length == 0) {
608                // read the header from the first line of the file
609                final CSVRecord nextRecord = nextRecord();
610                if (nextRecord != null) {
611                    headerRecord = nextRecord.values();
612                    headerComment = nextRecord.getComment();
613                }
614            } else {
615                if (format.getSkipHeaderRecord()) {
616                    final CSVRecord nextRecord = nextRecord();
617                    if (nextRecord != null) {
618                        headerComment = nextRecord.getComment();
619                    }
620                }
621                headerRecord = formatHeader;
622            }
623            // build the name to index mappings
624            if (headerRecord != null) {
625                // Track an occurrence of a null, empty or blank header.
626                boolean observedMissing = false;
627                for (int i = 0; i < headerRecord.length; i++) {
628                    final String header = headerRecord[i];
629                    final boolean blankHeader = CSVFormat.isBlank(header);
630                    if (blankHeader && !format.getAllowMissingColumnNames()) {
631                        throw new IllegalArgumentException("A header name is missing in " + Arrays.toString(headerRecord));
632                    }
633                    final boolean containsHeader = blankHeader ? observedMissing : headerMap.containsKey(header);
634                    final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
635                    final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
636                    final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
637                    if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
638                        throw new IllegalArgumentException(String.format(
639                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
640                                header, Arrays.toString(headerRecord)));
641                    }
642                    observedMissing |= blankHeader;
643                    if (header != null) {
644                        headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional
645                        if (headerNames == null) {
646                            headerNames = new ArrayList<>(headerRecord.length);
647                        }
648                        headerNames.add(header);
649                    }
650                }
651            }
652        }
653        // Make header names Collection immutable
654        return new Headers(headerMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
655    }
656
657    /**
658     * Gets the current line number in the input stream.
659     *
660     * <p>
661     * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
662     * the record number.
663     * </p>
664     *
665     * @return current line number.
666     */
667    public long getCurrentLineNumber() {
668        return lexer.getCurrentLineNumber();
669    }
670
671    /**
672     * Gets the first end-of-line string encountered.
673     *
674     * @return the first end-of-line string.
675     * @since 1.5
676     */
677    public String getFirstEndOfLine() {
678        return lexer.getFirstEol();
679    }
680
681    /**
682     * Gets the header comment, if any.
683     * The header comment appears before the header record.
684     *
685     * @return the header comment for this stream, or null if no comment is available.
686     * @since 1.10.0
687     */
688    public String getHeaderComment() {
689        return headerComment;
690    }
691
692    /**
693     * Gets a copy of the header map as defined in the CSVFormat's header.
694     * <p>
695     * The map keys are column names. The map values are 0-based indices.
696     * </p>
697     * <p>
698     * <strong>Note:</strong> The map can only provide a one-to-one mapping when the format did not
699     * contain null or duplicate column names.
700     * </p>
701     *
702     * @return a copy of the header map.
703     */
704    public Map<String, Integer> getHeaderMap() {
705        if (headers.headerMap == null) {
706            return null;
707        }
708        final Map<String, Integer> map = createEmptyHeaderMap();
709        map.putAll(headers.headerMap);
710        return map;
711    }
712
713    /**
714     * Gets the underlying header map.
715     *
716     * @return the underlying header map.
717     */
718    Map<String, Integer> getHeaderMapRaw() {
719        return headers.headerMap;
720    }
721
722    /**
723     * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
724     * <p>
725     * Note: The list provides strings that can be used as keys in the header map.
726     * The list will not contain null column names if they were present in the input
727     * format.
728     * </p>
729     *
730     * @return read-only list of header names that iterates in column order.
731     * @see #getHeaderMap()
732     * @since 1.7
733     */
734    public List<String> getHeaderNames() {
735        return Collections.unmodifiableList(headers.headerNames);
736    }
737
738    /**
739     * Gets the current record number in the input stream.
740     *
741     * <p>
742     * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
743     * the line number.
744     * </p>
745     *
746     * @return current record number
747     */
748    public long getRecordNumber() {
749        return recordNumber;
750    }
751
752    /**
753     * Parses the CSV input according to the given format and returns the content as a list of
754     * {@link CSVRecord CSVRecords}.
755     *
756     * <p>
757     * The returned content starts at the current parse-position in the stream.
758     * </p>
759     * <p>
760     * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows this method produces.
761     * </p>
762     *
763     * @return list of {@link CSVRecord CSVRecords}, may be empty
764     * @throws UncheckedIOException
765     *             on parse error or input read-failure
766     */
767    public List<CSVRecord> getRecords() {
768        return stream().collect(Collectors.toList());
769    }
770
771    /**
772     * Gets the trailer comment, if any.
773     * Trailer comments are located between the last record and EOF
774     *
775     * @return the trailer comment for this stream, or null if no comment is available.
776     * @since 1.10.0
777     */
778    public String getTrailerComment() {
779        return trailerComment;
780    }
781
782    /**
783     * Handles whether the input is parsed as null
784     *
785     * @param input
786     *           the cell data to further processed
787     * @return null if input is parsed as null, or input itself if the input isn't parsed as null
788     */
789    private String handleNull(final String input) {
790        final boolean isQuoted = reusableToken.isQuoted;
791        final String nullString = format.getNullString();
792        final boolean strictQuoteMode = isStrictQuoteMode();
793        if (input.equals(nullString)) {
794            // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
795            return strictQuoteMode && isQuoted ? input : null;
796        }
797        // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
798        return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
799    }
800
801    /**
802     * Checks whether there is a header comment.
803     * The header comment appears before the header record.
804     * Note that if the parser's format has been given an explicit header
805     * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
806     * and the header record is not being skipped
807     * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
808     * will be associated with the first record, not the header.
809     *
810     * @return true if this parser has seen a header comment, false otherwise
811     * @since 1.10.0
812     */
813    public boolean hasHeaderComment() {
814        return headerComment != null;
815    }
816
817    /**
818     * Checks whether there is a trailer comment.
819     * Trailer comments are located between the last record and EOF.
820     * The trailer comments will only be available after the parser has
821     * finished processing this stream.
822     *
823     * @return true if this parser has seen a trailer comment, false otherwise
824     * @since 1.10.0
825     */
826    public boolean hasTrailerComment() {
827        return trailerComment != null;
828    }
829
830    /**
831     * Tests whether this parser is closed.
832     *
833     * @return whether this parser is closed.
834     */
835    public boolean isClosed() {
836        return lexer.isClosed();
837    }
838
839    /**
840     * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
841     *
842     * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
843     *         {@link QuoteMode#NON_NUMERIC}.
844     */
845    private boolean isStrictQuoteMode() {
846        return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
847               format.getQuoteMode() == QuoteMode.NON_NUMERIC;
848    }
849
850    /**
851     * Returns the record iterator.
852     *
853     * <p>
854     * An {@link IOException} caught during the iteration is re-thrown as an
855     * {@link IllegalStateException}.
856     * </p>
857     * <p>
858     * If the parser is closed, the iterator will not yield any more records.
859     * A call to {@link Iterator#hasNext()} will return {@code false} and
860     * a call to {@link Iterator#next()} will throw a
861     * {@link NoSuchElementException}.
862     * </p>
863     * <p>
864     * If it is necessary to construct an iterator which is usable after the
865     * parser is closed, one option is to extract all records as a list with
866     * {@link #getRecords()}, and return an iterator to that list.
867     * </p>
868     * <p>
869     * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows an Iterator produces.
870     * </p>
871     */
872    @Override
873    public Iterator<CSVRecord> iterator() {
874        return csvRecordIterator;
875    }
876
    /**
     * Parses the next record from the current point in the stream.
     *
     * <p>
     * Tokens are pulled from the lexer until a token ends the record or the input. Comment tokens met along the way
     * are joined with line feeds; the joined text is attached to the record that follows, or stored as the trailer
     * comment when the end of the input is reached without a pending value.
     * </p>
     *
     * @return the next {@link CSVRecord}, or {@code null} if no values were read because the end of the stream has
     *         been reached.
     * @throws IOException  on parse error or input read-failure.
     * @throws CSVException on invalid CSV input data (an invalid or unexpected token type).
     */
    CSVRecord nextRecord() throws IOException {
        CSVRecord result = null;
        // The value list is shared between calls, so start this record from an empty list.
        recordList.clear();
        // Collects comment text seen before the record; stays null until the first comment token.
        StringBuilder sb = null;
        // Capture the start positions before any token of this record is consumed.
        final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
        // NOTE(review): a character offset is added to a byte count here; confirm this is intended.
        final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
        do {
            reusableToken.reset();
            lexer.nextToken(reusableToken);
            switch (reusableToken.type) {
            case TOKEN:
                // A value followed by more values of the same record: keep looping.
                addRecordValue(false);
                break;
            case EORECORD:
                // Value that ends the record (the flag presumably marks the last value; see addRecordValue).
                addRecordValue(true);
                break;
            case EOF:
                if (reusableToken.isReady) {
                    // Presumably the token still holds a pending value at end of input; add it as the last value.
                    addRecordValue(true);
                } else if (sb != null) {
                    // Comments were collected but no value follows them: they form the trailer comment.
                    trailerComment = sb.toString();
                }
                break;
            case INVALID:
                throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
            case COMMENT: // Collected into sb; not added to the record values
                if (sb == null) { // first comment for this record
                    sb = new StringBuilder();
                } else {
                    // Separate consecutive comment lines with a line feed.
                    sb.append(Constants.LF);
                }
                sb.append(reusableToken.content);
                reusableToken.type = TOKEN; // Read another token: makes the loop condition below true
                break;
            default:
                throw new CSVException("Unexpected Token type: %s", reusableToken.type);
            }
        } while (reusableToken.type == TOKEN);

        // Only build a record when at least one value was collected; otherwise null is returned.
        if (!recordList.isEmpty()) {
            recordNumber++;
            // The comment is null when no comment token preceded this record.
            final String comment = Objects.toString(sb, null);
            result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
                recordNumber, startCharPosition, startBytePosition);
        }
        return result;
    }
931
932    /**
933     * Returns a sequential {@code Stream} with this collection as its source.
934     * <p>
935     * If the parser is closed, the stream will not produce any more values.
936     * See the comments in {@link #iterator()}.
937     * </p>
938     * <p>
939     * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows a Stream produces.
940     * </p>
941     *
942     * @return a sequential {@code Stream} with this collection as its source.
943     * @since 1.9.0
944     */
945    public Stream<CSVRecord> stream() {
946        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
947    }
948
949}