View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Token.Type.TOKEN;
23  
24  import java.io.Closeable;
25  import java.io.File;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.InputStreamReader;
29  import java.io.Reader;
30  import java.io.StringReader;
31  import java.io.UncheckedIOException;
32  import java.net.URL;
33  import java.nio.charset.Charset;
34  import java.nio.file.Files;
35  import java.nio.file.Path;
36  import java.util.ArrayList;
37  import java.util.Arrays;
38  import java.util.Collections;
39  import java.util.Iterator;
40  import java.util.LinkedHashMap;
41  import java.util.List;
42  import java.util.Map;
43  import java.util.NoSuchElementException;
44  import java.util.Objects;
45  import java.util.Spliterator;
46  import java.util.Spliterators;
47  import java.util.TreeMap;
48  import java.util.stream.Collectors;
49  import java.util.stream.Stream;
50  import java.util.stream.StreamSupport;
51  
52  import org.apache.commons.io.Charsets;
53  import org.apache.commons.io.build.AbstractStreamBuilder;
54  import org.apache.commons.io.function.Uncheck;
55  
56  /**
57   * Parses CSV files according to the specified format.
58   *
59   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
60   * specification of a {@link CSVFormat}.
61   *
62   * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
63   *
64   * <h2>Creating instances</h2>
65   * <p>
66   * There are several static factory methods that can be used to create instances for various types of resources:
67   * </p>
68   * <ul>
69   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
70   *     <li>{@link #parse(String, CSVFormat)}</li>
71   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
72   * </ul>
73   * <p>
74   * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
75   *
76   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
77   * </p>
78   * <pre>
79   * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
80   *     ...
81   * }
82   * </pre>
83   *
84   * <h2>Parsing record wise</h2>
85   * <p>
86   * To parse a CSV input from a file, you write:
87   * </p>
88   *
89   * <pre>{@code
90   * File csvData = new File("/path/to/csv");
91   * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
92   * for (CSVRecord csvRecord : parser) {
93   *     ...
94   * }}
95   * </pre>
96   *
97   * <p>
98   * This will read and parse the contents of the file using the
99   * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
100  * </p>
101  *
102  * <p>
103  * To parse CSV input in a format like Excel, you write:
104  * </p>
105  *
106  * <pre>
107  * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
108  * for (CSVRecord csvRecord : parser) {
109  *     ...
110  * }
111  * </pre>
112  *
113  * <p>
114  * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
115  * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
116  * </p>
117  *
118  * <h2>Parsing into memory</h2>
119  * <p>
120  * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
121  * </p>
122  *
123  * <pre>{@code
124  * Reader in = new StringReader("a;b\nc;d");
125  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
126  * List<CSVRecord> list = parser.getRecords();
127  * }</pre>
128  *
129  * <p>
130  * There are two constraints that have to be kept in mind:
131  * </p>
132  *
133  * <ol>
134  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
135  *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
136  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
137  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
138  * </ol>
139  *
140  * <h2>Notes</h2>
141  * <p>
142  * The internal parser state is completely covered by the format and the reader state.
143  * </p>
144  *
145  * @see <a href="package-summary.html">package documentation for more details</a>
146  */
147 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
148 
149     /**
150      * Builds a new {@link CSVParser}.
151      *
152      * @since 1.13.0
153      */
154     public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
155 
156         private CSVFormat format;
157         private long characterOffset;
158         private long recordNumber = 1;
159         private boolean trackBytes;
160 
161         /**
162          * Constructs a new instance.
163          */
164         protected Builder() {
165             // empty
166         }
167 
168         @SuppressWarnings("resource")
169         @Override
170         public CSVParser get() throws IOException {
171             return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
172         }
173 
174         /**
175          * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
176          *
177          * @param characterOffset the lexer offset.
178          * @return this instance.
179          */
180         public Builder setCharacterOffset(final long characterOffset) {
181             this.characterOffset = characterOffset;
182             return asThis();
183         }
184 
185         /**
186          * Sets the CSV format. A copy of the given format is kept.
187          *
188          * @param format the CSV format, {@code null} resets to {@link CSVFormat#DEFAULT}.
189          * @return this instance.
190          */
191         public Builder setFormat(final CSVFormat format) {
192             this.format = CSVFormat.copy(format);
193             return asThis();
194         }
195 
196         /**
197          * Sets the next record number to assign, defaults to {@code 1}.
198          *
199          * @param recordNumber the next record number to assign.
200          * @return this instance.
201          */
202         public Builder setRecordNumber(final long recordNumber) {
203             this.recordNumber = recordNumber;
204             return asThis();
205         }
206 
207         /**
208          * Sets whether to enable byte tracking for the parser.
209          *
210          * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
211          * @return this instance.
212          * @since 1.13.0
213          */
214         public Builder setTrackBytes(final boolean trackBytes) {
215             this.trackBytes = trackBytes;
216             return asThis();
217         }
218 
219     }
220 
221     final class CSVRecordIterator implements Iterator<CSVRecord> {
222         private CSVRecord current;
223 
224         /**
225          * Gets the next record or null at the end of stream or max rows read.
226          *
227          * @throws IOException  on parse error or input read-failure
228          * @throws CSVException on invalid input.
229          * @return the next record, or {@code null} if the end of the stream has been reached.
230          */
231         private CSVRecord getNextRecord() {
232             CSVRecord record = null;
233             if (format.useRow(recordNumber + 1)) {
234                 record = Uncheck.get(CSVParser.this::nextRecord);
235             }
236             return record;
237         }
238 
239         @Override
240         public boolean hasNext() {
241             if (isClosed()) {
242                 return false;
243             }
244             if (current == null) {
245                 current = getNextRecord();
246             }
247             return current != null;
248         }
249 
250         @Override
251         public CSVRecord next() {
252             if (isClosed()) {
253                 throw new NoSuchElementException("CSVParser has been closed");
254             }
255             CSVRecord next = current;
256             current = null;
257             if (next == null) {
258                 // hasNext() wasn't called before
259                 next = getNextRecord();
260                 if (next == null) {
261                     throw new NoSuchElementException("No more CSV records available");
262                 }
263             }
264             return next;
265         }
266 
267         @Override
268         public void remove() {
269             throw new UnsupportedOperationException();
270         }
271     }
272     /**
273      * Header information based on name and position.
274      */
275     private static final class Headers {
276 
277         /**
278          * Header column positions (0-based)
279          */
280         final Map<String, Integer> headerMap;
281 
282         /**
283          * Header names in column order
284          */
285         final List<String> headerNames;
286 
287         Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
288             this.headerMap = headerMap;
289             this.headerNames = headerNames;
290         }
291     }
292 
293     /**
294      * Creates a new builder.
295      *
296      * @return a new builder.
297      * @since 1.13.0
298      */
299     public static Builder builder() {
300         return new Builder();
301     }
302 
303     /**
304      * Creates a parser for the given {@link File}.
305      *
306      * @param file
307      *            a CSV file. Must not be null.
308      * @param charset
309      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
310      * @param format
311      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
312      * @return a new parser
313      * @throws IllegalArgumentException
314      *             If the parameters of the format are inconsistent.
315      * @throws IOException
316      *             If an I/O error occurs
317      * @throws CSVException Thrown on invalid CSV input data.
318      * @throws NullPointerException if {@code file} is {@code null}.
319      */
320     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
321         Objects.requireNonNull(file, "file");
322         return parse(file.toPath(), charset, format);
323     }
324 
325     /**
326      * Creates a CSV parser using the given {@link CSVFormat}.
327      *
328      * <p>
329      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
330      * unless you close the {@code reader}.
331      * </p>
332      *
333      * @param inputStream
334      *            an InputStream containing CSV-formatted input, {@code null} maps to {@link CSVFormat#DEFAULT}.
335      * @param charset
336      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
337      * @param format
338      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
339      * @return a new CSVParser configured with the given reader and format.
340      * @throws IllegalArgumentException
341      *             If the parameters of the format are inconsistent or if either reader or format are null.
342      * @throws IOException
343      *             If there is a problem reading the header or skipping the first record
344      * @throws CSVException Thrown on invalid CSV input data.
345      * @since 1.5
346      */
347     public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
348             throws IOException {
349         return parse(new InputStreamReader(inputStream, Charsets.toCharset(charset)), format);
350     }
351 
352     /**
353      * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
354      *
355      * @param path
356      *            a CSV file. Must not be null.
357      * @param charset
358      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
359      * @param format
360      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
361      * @return a new parser
362      * @throws IllegalArgumentException
363      *             If the parameters of the format are inconsistent.
364      * @throws IOException
365      *             If an I/O error occurs
366      * @throws CSVException Thrown on invalid CSV input data.
367      * @throws NullPointerException if {@code path} is {@code null}.
368      * @since 1.5
369      */
370     @SuppressWarnings("resource")
371     public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
372         Objects.requireNonNull(path, "path");
373         return parse(Files.newInputStream(path), charset, format);
374     }
375 
376     /**
377      * Creates a CSV parser using the given {@link CSVFormat}
378      *
379      * <p>
380      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
381      * unless you close the {@code reader}.
382      * </p>
383      *
384      * @param reader
385      *            a Reader containing CSV-formatted input. Must not be null.
386      * @param format
387      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
388      * @return a new CSVParser configured with the given reader and format.
389      * @throws IllegalArgumentException
390      *             If the parameters of the format are inconsistent or if either reader or format are null.
391      * @throws IOException
392      *             If there is a problem reading the header or skipping the first record
393      * @throws CSVException Thrown on invalid CSV input data.
394      * @since 1.5
395      */
396     public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
397         return builder().setReader(reader).setFormat(format).get();
398     }
399 
400     /**
401      * Creates a parser for the given {@link String}.
402      *
403      * @param string
404      *            a CSV string. Must not be null.
405      * @param format
406      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
407      * @return a new parser
408      * @throws IllegalArgumentException
409      *             If the parameters of the format are inconsistent.
410      * @throws IOException
411      *             If an I/O error occurs
412      * @throws CSVException Thrown on invalid CSV input data.
413      * @throws NullPointerException if {@code string} is {@code null}.
414      */
415     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
416         Objects.requireNonNull(string, "string");
417         return parse(new StringReader(string), format);
418     }
419 
420     /**
421      * Creates and returns a parser for the given URL, which the caller MUST close.
422      *
423      * <p>
424      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
425      * you close the {@code url}.
426      * </p>
427      *
428      * @param url
429      *            a URL. Must not be null.
430      * @param charset
431      *            the charset for the resource, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
432      * @param format
433      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
434      * @return a new parser
435      * @throws IllegalArgumentException
436      *             If the parameters of the format are inconsistent.
437      * @throws IOException
438      *             If an I/O error occurs
439      * @throws CSVException Thrown on invalid CSV input data.
440      * @throws NullPointerException if {@code url} is {@code null}.
441      */
442     @SuppressWarnings("resource")
443     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
444         Objects.requireNonNull(url, "url");
445         return parse(url.openStream(), charset, format);
446     }
447 
448     private String headerComment;
449 
450     private String trailerComment;
451 
452     private final CSVFormat format;
453 
454     private final Headers headers;
455 
456     private final Lexer lexer;
457 
458     private final CSVRecordIterator csvRecordIterator;
459 
460     /** A record buffer for getRecord(). Grows as necessary and is reused. */
461     private final List<String> recordList = new ArrayList<>();
462 
463     /**
464      * The next record number to assign.
465      */
466     private long recordNumber;
467 
468     /**
469      * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
470      * with {@link #recordNumber}.
471      */
472     private final long characterOffset;
473 
474     private final Token reusableToken = new Token();
475 
476     /**
477      * Constructs a new instance using the given {@link CSVFormat}.
478      *
479      * <p>
480      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
481      * unless you close the {@code reader}.
482      * </p>
483      *
484      * @param reader
485      *            a Reader containing CSV-formatted input. Must not be null.
486      * @param format
487      *            the CSVFormat used for CSV parsing. Must not be null.
488      * @throws IllegalArgumentException
489      *             If the parameters of the format are inconsistent or if either reader or format are null.
490      * @throws IOException
491      *             If there is a problem reading the header or skipping the first record
492      * @throws CSVException Thrown on invalid CSV input data.
493      * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
494      */
495     @Deprecated
496     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
497         this(reader, format, 0, 1);
498     }
499 
500     /**
501      * Constructs a new instance using the given {@link CSVFormat}.
502      *
503      * <p>
504      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
505      * unless you close the {@code reader}.
506      * </p>
507      *
508      * @param reader
509      *            a Reader containing CSV-formatted input. Must not be null.
510      * @param format
511      *            the CSVFormat used for CSV parsing. Must not be null.
512      * @param characterOffset
513      *            Lexer offset when the parser does not start parsing at the beginning of the source.
514      * @param recordNumber
515      *            The next record number to assign.
516      * @throws IllegalArgumentException
517      *             If the parameters of the format are inconsistent or if either the reader or format is null.
518      * @throws IOException
519      *             if there is a problem reading the header or skipping the first record
520      * @throws CSVException on invalid input.
521      * @since 1.1
522      * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
523      */
524     @Deprecated
525     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException {
526         this(reader, format, characterOffset, recordNumber, null, false);
527     }
528 
529     /**
530      * Constructs a new instance using the given {@link CSVFormat}.
531      *
532      * <p>
533      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
534      * unless you close the {@code reader}.
535      * </p>
536      *
537      * @param reader
538      *            a Reader containing CSV-formatted input. Must not be null.
539      * @param format
540      *            the CSVFormat used for CSV parsing. Must not be null.
541      * @param characterOffset
542      *            Lexer offset when the parser does not start parsing at the beginning of the source.
543      * @param recordNumber
544      *            The next record number to assign.
545      * @param charset
546      *            The character encoding to be used for the reader when enableByteTracking is true.
547      * @param trackBytes
548      *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
549      * @throws IllegalArgumentException
550      *             If the parameters of the format are inconsistent or if either the reader or format is null.
551      * @throws IOException
552      *             If there is a problem reading the header or skipping the first record.
553      * @throws CSVException Thrown on invalid CSV input data.
554      */
555     private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
556         final Charset charset, final boolean trackBytes)
557         throws IOException {
558         Objects.requireNonNull(reader, "reader");
559         Objects.requireNonNull(format, "format");
560         this.format = format.copy();
561         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
562         this.csvRecordIterator = new CSVRecordIterator();
563         this.headers = createHeaders();
564         this.characterOffset = characterOffset;
565         this.recordNumber = recordNumber - 1;
566     }
567 
568     private void addRecordValue(final boolean lastRecord) {
569         final String input = format.trim(reusableToken.content.toString());
570         if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
571             return;
572         }
573         recordList.add(handleNull(input));
574     }
575 
576     /**
577      * Closes resources.
578      *
579      * @throws IOException
580      *             If an I/O error occurs
581      */
582     @Override
583     public void close() throws IOException {
584         lexer.close();
585     }
586 
587     private Map<String, Integer> createEmptyHeaderMap() {
588         return format.getIgnoreHeaderCase() ?
589                 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
590                 new LinkedHashMap<>();
591     }
592 
593     /**
594      * Creates the name to index mapping if the format defines a header.
595      *
596      * @return null if the format has no header.
597      * @throws IOException if there is a problem reading the header or skipping the first record
598      * @throws CSVException on invalid input.
599      */
600     private Headers createHeaders() throws IOException {
601         Map<String, Integer> headerMap = null;
602         List<String> headerNames = null;
603         final String[] formatHeader = format.getHeader();
604         if (formatHeader != null) {
605             headerMap = createEmptyHeaderMap();
606             String[] headerRecord = null;
607             if (formatHeader.length == 0) {
608                 // read the header from the first line of the file
609                 final CSVRecord nextRecord = nextRecord();
610                 if (nextRecord != null) {
611                     headerRecord = nextRecord.values();
612                     headerComment = nextRecord.getComment();
613                 }
614             } else {
615                 if (format.getSkipHeaderRecord()) {
616                     final CSVRecord nextRecord = nextRecord();
617                     if (nextRecord != null) {
618                         headerComment = nextRecord.getComment();
619                     }
620                 }
621                 headerRecord = formatHeader;
622             }
623             // build the name to index mappings
624             if (headerRecord != null) {
625                 // Track an occurrence of a null, empty or blank header.
626                 boolean observedMissing = false;
627                 for (int i = 0; i < headerRecord.length; i++) {
628                     final String header = headerRecord[i];
629                     final boolean blankHeader = CSVFormat.isBlank(header);
630                     if (blankHeader && !format.getAllowMissingColumnNames()) {
631                         throw new IllegalArgumentException("A header name is missing in " + Arrays.toString(headerRecord));
632                     }
633                     final boolean containsHeader = blankHeader ? observedMissing : headerMap.containsKey(header);
634                     final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
635                     final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
636                     final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
637                     if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
638                         throw new IllegalArgumentException(String.format(
639                                 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
640                                 header, Arrays.toString(headerRecord)));
641                     }
642                     observedMissing |= blankHeader;
643                     if (header != null) {
644                         headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional
645                         if (headerNames == null) {
646                             headerNames = new ArrayList<>(headerRecord.length);
647                         }
648                         headerNames.add(header);
649                     }
650                 }
651             }
652         }
653         // Make header names Collection immutable
654         return new Headers(headerMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
655     }
656 
657     /**
658      * Gets the current line number in the input stream.
659      *
660      * <p>
661      * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
662      * the record number.
663      * </p>
664      *
665      * @return current line number.
666      */
667     public long getCurrentLineNumber() {
668         return lexer.getCurrentLineNumber();
669     }
670 
671     /**
672      * Gets the first end-of-line string encountered.
673      *
674      * @return the first end-of-line string.
675      * @since 1.5
676      */
677     public String getFirstEndOfLine() {
678         return lexer.getFirstEol();
679     }
680 
681     /**
682      * Gets the header comment, if any.
683      * The header comment appears before the header record.
684      *
685      * @return the header comment for this stream, or null if no comment is available.
686      * @since 1.10.0
687      */
688     public String getHeaderComment() {
689         return headerComment;
690     }
691 
692     /**
693      * Gets a copy of the header map as defined in the CSVFormat's header.
694      * <p>
695      * The map keys are column names. The map values are 0-based indices.
696      * </p>
697      * <p>
698      * <strong>Note:</strong> The map can only provide a one-to-one mapping when the format did not
699      * contain null or duplicate column names.
700      * </p>
701      *
702      * @return a copy of the header map.
703      */
704     public Map<String, Integer> getHeaderMap() {
705         if (headers.headerMap == null) {
706             return null;
707         }
708         final Map<String, Integer> map = createEmptyHeaderMap();
709         map.putAll(headers.headerMap);
710         return map;
711     }
712 
713     /**
714      * Gets the underlying header map.
715      *
716      * @return the underlying header map.
717      */
718     Map<String, Integer> getHeaderMapRaw() {
719         return headers.headerMap;
720     }
721 
722     /**
723      * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
724      * <p>
725      * Note: The list provides strings that can be used as keys in the header map.
726      * The list will not contain null column names if they were present in the input
727      * format.
728      * </p>
729      *
730      * @return read-only list of header names that iterates in column order.
731      * @see #getHeaderMap()
732      * @since 1.7
733      */
734     public List<String> getHeaderNames() {
735         return Collections.unmodifiableList(headers.headerNames);
736     }
737 
738     /**
739      * Gets the current record number in the input stream.
740      *
741      * <p>
742      * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
743      * the line number.
744      * </p>
745      *
746      * @return current record number
747      */
748     public long getRecordNumber() {
749         return recordNumber;
750     }
751 
752     /**
753      * Parses the CSV input according to the given format and returns the content as a list of
754      * {@link CSVRecord CSVRecords}.
755      *
756      * <p>
757      * The returned content starts at the current parse-position in the stream.
758      * </p>
759      * <p>
760      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows this method produces.
761      * </p>
762      *
763      * @return list of {@link CSVRecord CSVRecords}, may be empty
764      * @throws UncheckedIOException
765      *             on parse error or input read-failure
766      */
767     public List<CSVRecord> getRecords() {
768         return stream().collect(Collectors.toList());
769     }
770 
771     /**
772      * Gets the trailer comment, if any.
773      * Trailer comments are located between the last record and EOF
774      *
775      * @return the trailer comment for this stream, or null if no comment is available.
776      * @since 1.10.0
777      */
778     public String getTrailerComment() {
779         return trailerComment;
780     }
781 
782     /**
783      * Handles whether the input is parsed as null
784      *
785      * @param input
786      *           the cell data to further processed
787      * @return null if input is parsed as null, or input itself if the input isn't parsed as null
788      */
789     private String handleNull(final String input) {
790         final boolean isQuoted = reusableToken.isQuoted;
791         final String nullString = format.getNullString();
792         final boolean strictQuoteMode = isStrictQuoteMode();
793         if (input.equals(nullString)) {
794             // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
795             return strictQuoteMode && isQuoted ? input : null;
796         }
797         // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
798         return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
799     }
800 
801     /**
802      * Checks whether there is a header comment.
803      * The header comment appears before the header record.
804      * Note that if the parser's format has been given an explicit header
805      * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
806      * and the header record is not being skipped
807      * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
808      * will be associated with the first record, not the header.
809      *
810      * @return true if this parser has seen a header comment, false otherwise
811      * @since 1.10.0
812      */
813     public boolean hasHeaderComment() {
814         return headerComment != null;
815     }
816 
817     /**
818      * Checks whether there is a trailer comment.
819      * Trailer comments are located between the last record and EOF.
820      * The trailer comments will only be available after the parser has
821      * finished processing this stream.
822      *
823      * @return true if this parser has seen a trailer comment, false otherwise
824      * @since 1.10.0
825      */
826     public boolean hasTrailerComment() {
827         return trailerComment != null;
828     }
829 
830     /**
831      * Tests whether this parser is closed.
832      *
833      * @return whether this parser is closed.
834      */
835     public boolean isClosed() {
836         return lexer.isClosed();
837     }
838 
839     /**
840      * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
841      *
842      * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
843      *         {@link QuoteMode#NON_NUMERIC}.
844      */
845     private boolean isStrictQuoteMode() {
846         return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
847                format.getQuoteMode() == QuoteMode.NON_NUMERIC;
848     }
849 
850     /**
851      * Returns the record iterator.
852      *
853      * <p>
854      * An {@link IOException} caught during the iteration is re-thrown as an
855      * {@link IllegalStateException}.
856      * </p>
857      * <p>
858      * If the parser is closed, the iterator will not yield any more records.
859      * A call to {@link Iterator#hasNext()} will return {@code false} and
860      * a call to {@link Iterator#next()} will throw a
861      * {@link NoSuchElementException}.
862      * </p>
863      * <p>
864      * If it is necessary to construct an iterator which is usable after the
865      * parser is closed, one option is to extract all records as a list with
866      * {@link #getRecords()}, and return an iterator to that list.
867      * </p>
868      * <p>
869      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows an Iterator produces.
870      * </p>
871      */
872     @Override
873     public Iterator<CSVRecord> iterator() {
874         return csvRecordIterator;
875     }
876 
877     /**
878      * Parses the next record from the current point in the stream.
879      *
880      * @return the record as an array of values, or {@code null} if the end of the stream has been reached.
881      * @throws IOException  on parse error or input read-failure.
882      * @throws CSVException on invalid CSV input data.
883      */
884     CSVRecord nextRecord() throws IOException {
885         CSVRecord result = null;
886         recordList.clear();
887         StringBuilder sb = null;
888         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
889         final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
890         do {
891             reusableToken.reset();
892             lexer.nextToken(reusableToken);
893             switch (reusableToken.type) {
894             case TOKEN:
895                 addRecordValue(false);
896                 break;
897             case EORECORD:
898                 addRecordValue(true);
899                 break;
900             case EOF:
901                 if (reusableToken.isReady) {
902                     addRecordValue(true);
903                 } else if (sb != null) {
904                     trailerComment = sb.toString();
905                 }
906                 break;
907             case INVALID:
908                 throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
909             case COMMENT: // Ignored currently
910                 if (sb == null) { // first comment for this record
911                     sb = new StringBuilder();
912                 } else {
913                     sb.append(Constants.LF);
914                 }
915                 sb.append(reusableToken.content);
916                 reusableToken.type = TOKEN; // Read another token
917                 break;
918             default:
919                 throw new CSVException("Unexpected Token type: %s", reusableToken.type);
920             }
921         } while (reusableToken.type == TOKEN);
922 
923         if (!recordList.isEmpty()) {
924             recordNumber++;
925             final String comment = Objects.toString(sb, null);
926             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
927                 recordNumber, startCharPosition, startBytePosition);
928         }
929         return result;
930     }
931 
932     /**
933      * Returns a sequential {@code Stream} with this collection as its source.
934      * <p>
935      * If the parser is closed, the stream will not produce any more values.
936      * See the comments in {@link #iterator()}.
937      * </p>
938      * <p>
939      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows a Stream produces.
940      * </p>
941      *
942      * @return a sequential {@code Stream} with this collection as its source.
943      * @since 1.9.0
944      */
945     public Stream<CSVRecord> stream() {
946         return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
947     }
948 
949 }