View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Token.Type.TOKEN;
23  
24  import java.io.Closeable;
25  import java.io.File;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.InputStreamReader;
29  import java.io.Reader;
30  import java.io.StringReader;
31  import java.io.UncheckedIOException;
32  import java.net.URL;
33  import java.nio.charset.Charset;
34  import java.nio.file.Files;
35  import java.nio.file.Path;
36  import java.util.ArrayList;
37  import java.util.Arrays;
38  import java.util.Collections;
39  import java.util.Iterator;
40  import java.util.LinkedHashMap;
41  import java.util.List;
42  import java.util.Map;
43  import java.util.NoSuchElementException;
44  import java.util.Objects;
45  import java.util.Spliterator;
46  import java.util.Spliterators;
47  import java.util.TreeMap;
48  import java.util.stream.Collectors;
49  import java.util.stream.Stream;
50  import java.util.stream.StreamSupport;
51  
52  import org.apache.commons.io.build.AbstractStreamBuilder;
53  import org.apache.commons.io.function.Uncheck;
54  
55  /**
56   * Parses CSV files according to the specified format.
57   *
58   * Because CSV appears in many different dialects, the parser supports many formats by allowing the
59   * specification of a {@link CSVFormat}.
60   *
61   * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
62   *
63   * <h2>Creating instances</h2>
64   * <p>
65   * There are several static factory methods that can be used to create instances for various types of resources:
66   * </p>
67   * <ul>
68   *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
69   *     <li>{@link #parse(String, CSVFormat)}</li>
70   *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
71   * </ul>
72   * <p>
73   * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
74   *
75   * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
76   * </p>
77   * <pre>
78   * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
79   *     ...
80   * }
81   * </pre>
82   *
83   * <h2>Parsing record wise</h2>
84   * <p>
85   * To parse a CSV input from a file, you write:
86   * </p>
87   *
88   * <pre>{@code
89   * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
91   * for (CSVRecord csvRecord : parser) {
92   *     ...
93   * }}
94   * </pre>
95   *
96   * <p>
 * This will read and parse the contents of the file using the
98   * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
99   * </p>
100  *
101  * <p>
102  * To parse CSV input in a format like Excel, you write:
103  * </p>
104  *
105  * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
107  * for (CSVRecord csvRecord : parser) {
108  *     ...
109  * }
110  * </pre>
111  *
112  * <p>
113  * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
114  * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
115  * </p>
116  *
117  * <h2>Parsing into memory</h2>
118  * <p>
119  * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
120  * </p>
121  *
122  * <pre>{@code
123  * Reader in = new StringReader("a;b\nc;d");
124  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
125  * List<CSVRecord> list = parser.getRecords();
126  * }</pre>
127  *
128  * <p>
129  * There are two constraints that have to be kept in mind:
130  * </p>
131  *
132  * <ol>
133  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
134  *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
135  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
136  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
137  * </ol>
138  *
139  * <h2>Notes</h2>
140  * <p>
141  * The internal parser state is completely covered by the format and the reader state.
142  * </p>
143  *
144  * @see <a href="package-summary.html">package documentation for more details</a>
145  */
146 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
147 
148     /**
149      * Builds a new {@link CSVParser}.
150      *
151      * @since 1.13.0
152      */
153     public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
154 
155         private CSVFormat format;
156         private long characterOffset;
157         private long recordNumber = 1;
158         private boolean trackBytes;
159 
160         /**
161          * Constructs a new instance.
162          */
163         protected Builder() {
164             // empty
165         }
166 
167         @SuppressWarnings("resource")
168         @Override
169         public CSVParser get() throws IOException {
170             return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
171         }
172 
173         /**
174          * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
175          *
176          * @param characterOffset the lexer offset.
177          * @return this instance.
178          */
179         public Builder setCharacterOffset(final long characterOffset) {
180             this.characterOffset = characterOffset;
181             return asThis();
182         }
183 
184         /**
185          * Sets the CSV format. A copy of the given format is kept.
186          *
187          * @param format the CSV format, null is equivalent to {@link CSVFormat#DEFAULT}.
188          * @return this instance.
189          */
190         public Builder setFormat(final CSVFormat format) {
191             this.format = CSVFormat.copy(format);
192             return asThis();
193         }
194 
195         /**
196          * Sets the next record number to assign, defaults to {@code 1}.
197          *
198          * @param recordNumber the next record number to assign.
199          * @return this instance.
200          */
201         public Builder setRecordNumber(final long recordNumber) {
202             this.recordNumber = recordNumber;
203             return asThis();
204         }
205 
206         /**
207          * Sets whether to enable byte tracking for the parser.
208          *
209          * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
210          * @return this instance.
211          * @since 1.13.0
212          */
213         public Builder setTrackBytes(final boolean trackBytes) {
214             this.trackBytes = trackBytes;
215             return asThis();
216         }
217 
218     }
219 
220     final class CSVRecordIterator implements Iterator<CSVRecord> {
221         private CSVRecord current;
222 
223         /**
224          * Gets the next record.
225          *
226          * @throws IOException  on parse error or input read-failure
227          * @throws CSVException on invalid input.
228          * @return the next record.
229          */
230         private CSVRecord getNextRecord() {
231             return Uncheck.get(CSVParser.this::nextRecord);
232         }
233 
234         @Override
235         public boolean hasNext() {
236             if (isClosed()) {
237                 return false;
238             }
239             if (current == null) {
240                 current = getNextRecord();
241             }
242 
243             return current != null;
244         }
245 
246         @Override
247         public CSVRecord next() {
248             if (isClosed()) {
249                 throw new NoSuchElementException("CSVParser has been closed");
250             }
251             CSVRecord next = current;
252             current = null;
253 
254             if (next == null) {
255                 // hasNext() wasn't called before
256                 next = getNextRecord();
257                 if (next == null) {
258                     throw new NoSuchElementException("No more CSV records available");
259                 }
260             }
261 
262             return next;
263         }
264 
265         @Override
266         public void remove() {
267             throw new UnsupportedOperationException();
268         }
269     }
270     /**
271      * Header information based on name and position.
272      */
273     private static final class Headers {
274 
275         /**
276          * Header column positions (0-based)
277          */
278         final Map<String, Integer> headerMap;
279 
280         /**
281          * Header names in column order
282          */
283         final List<String> headerNames;
284 
285         Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
286             this.headerMap = headerMap;
287             this.headerNames = headerNames;
288         }
289     }
290 
291     /**
292      * Creates a new builder.
293      *
294      * @return a new builder.
295      * @since 1.13.0
296      */
297     public static Builder builder() {
298         return new Builder();
299     }
300 
301     /**
302      * Creates a parser for the given {@link File}.
303      *
304      * @param file
305      *            a CSV file. Must not be null.
306      * @param charset
307      *            The Charset to decode the given file.
308      * @param format
309      *            the CSVFormat used for CSV parsing. Must not be null.
310      * @return a new parser
311      * @throws IllegalArgumentException
312      *             If the parameters of the format are inconsistent or if either file or format are null.
313      * @throws IOException
314      *             If an I/O error occurs
315      * @throws CSVException Thrown on invalid input.
316      */
317     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
318         Objects.requireNonNull(file, "file");
319         return parse(file.toPath(), charset, format);
320     }
321 
322     /**
323      * Creates a CSV parser using the given {@link CSVFormat}.
324      *
325      * <p>
326      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
327      * unless you close the {@code reader}.
328      * </p>
329      *
330      * @param inputStream
331      *            an InputStream containing CSV-formatted input. Must not be null.
332      * @param charset
333      *            The Charset to decode the given file.
334      * @param format
335      *            the CSVFormat used for CSV parsing. Must not be null.
336      * @return a new CSVParser configured with the given reader and format.
337      * @throws IllegalArgumentException
338      *             If the parameters of the format are inconsistent or if either reader or format are null.
339      * @throws IOException
340      *             If there is a problem reading the header or skipping the first record
341      * @throws CSVException Thrown on invalid input.
342      * @since 1.5
343      */
344     @SuppressWarnings("resource")
345     public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
346             throws IOException {
347         Objects.requireNonNull(inputStream, "inputStream");
348         Objects.requireNonNull(format, "format");
349         return parse(new InputStreamReader(inputStream, charset), format);
350     }
351 
352     /**
353      * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
354      *
355      * @param path
356      *            a CSV file. Must not be null.
357      * @param charset
358      *            The Charset to decode the given file.
359      * @param format
360      *            the CSVFormat used for CSV parsing. Must not be null.
361      * @return a new parser
362      * @throws IllegalArgumentException
363      *             If the parameters of the format are inconsistent or if either file or format are null.
364      * @throws IOException
365      *             If an I/O error occurs
366      * @throws CSVException Thrown on invalid input.
367      * @since 1.5
368      */
369     @SuppressWarnings("resource")
370     public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
371         Objects.requireNonNull(path, "path");
372         Objects.requireNonNull(format, "format");
373         return parse(Files.newInputStream(path), charset, format);
374     }
375 
376     /**
377      * Creates a CSV parser using the given {@link CSVFormat}
378      *
379      * <p>
380      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
381      * unless you close the {@code reader}.
382      * </p>
383      *
384      * @param reader
385      *            a Reader containing CSV-formatted input. Must not be null.
386      * @param format
387      *            the CSVFormat used for CSV parsing. Must not be null.
388      * @return a new CSVParser configured with the given reader and format.
389      * @throws IllegalArgumentException
390      *             If the parameters of the format are inconsistent or if either reader or format are null.
391      * @throws IOException
392      *             If there is a problem reading the header or skipping the first record
393      * @throws CSVException Thrown on invalid input.
394      * @since 1.5
395      */
396     public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
397         return builder().setReader(reader).setFormat(format).get();
398     }
399 
400     /**
401      * Creates a parser for the given {@link String}.
402      *
403      * @param string
404      *            a CSV string. Must not be null.
405      * @param format
406      *            the CSVFormat used for CSV parsing. Must not be null.
407      * @return a new parser
408      * @throws IllegalArgumentException
409      *             If the parameters of the format are inconsistent or if either string or format are null.
410      * @throws IOException
411      *             If an I/O error occurs
412      * @throws CSVException Thrown on invalid input.
413      */
414     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
415         Objects.requireNonNull(string, "string");
416         Objects.requireNonNull(format, "format");
417         return parse(new StringReader(string), format);
418     }
419 
420     /**
421      * Creates and returns a parser for the given URL, which the caller MUST close.
422      *
423      * <p>
424      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
425      * you close the {@code url}.
426      * </p>
427      *
428      * @param url
429      *            a URL. Must not be null.
430      * @param charset
431      *            the charset for the resource. Must not be null.
432      * @param format
433      *            the CSVFormat used for CSV parsing. Must not be null.
434      * @return a new parser
435      * @throws IllegalArgumentException
436      *             If the parameters of the format are inconsistent or if either url, charset or format are null.
437      * @throws IOException
438      *             If an I/O error occurs
439      * @throws CSVException Thrown on invalid input.
440      */
441     @SuppressWarnings("resource")
442     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
443         Objects.requireNonNull(url, "url");
444         return parse(url.openStream(), charset, format);
445     }
446 
447     private String headerComment;
448 
449     private String trailerComment;
450 
451     private final CSVFormat format;
452 
453     private final Headers headers;
454 
455     private final Lexer lexer;
456 
457     private final CSVRecordIterator csvRecordIterator;
458 
459     /** A record buffer for getRecord(). Grows as necessary and is reused. */
460     private final List<String> recordList = new ArrayList<>();
461 
462     /**
463      * The next record number to assign.
464      */
465     private long recordNumber;
466 
467     /**
468      * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
469      * with {@link #recordNumber}.
470      */
471     private final long characterOffset;
472 
473     private final Token reusableToken = new Token();
474 
475     /**
476      * Constructs a new instance using the given {@link CSVFormat}
477      *
478      * <p>
479      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
480      * unless you close the {@code reader}.
481      * </p>
482      *
483      * @param reader
484      *            a Reader containing CSV-formatted input. Must not be null.
485      * @param format
486      *            the CSVFormat used for CSV parsing. Must not be null.
487      * @throws IllegalArgumentException
488      *             If the parameters of the format are inconsistent or if either reader or format are null.
489      * @throws IOException
490      *             If there is a problem reading the header or skipping the first record
491      * @throws CSVException Thrown on invalid input.
492      * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
493      */
494     @Deprecated
495     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
496         this(reader, format, 0, 1);
497     }
498 
499     /**
500      * Constructs a new instance using the given {@link CSVFormat}
501      *
502      * <p>
503      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
504      * unless you close the {@code reader}.
505      * </p>
506      *
507      * @param reader
508      *            a Reader containing CSV-formatted input. Must not be null.
509      * @param format
510      *            the CSVFormat used for CSV parsing. Must not be null.
511      * @param characterOffset
512      *            Lexer offset when the parser does not start parsing at the beginning of the source.
513      * @param recordNumber
514      *            The next record number to assign.
515      * @throws IllegalArgumentException
516      *             If the parameters of the format are inconsistent or if either the reader or format is null.
517      * @throws IOException
518      *             if there is a problem reading the header or skipping the first record
519      * @throws CSVException on invalid input.
520      * @since 1.1
521      * @deprecated Will be private in the next major version, use {@link Builder#get()}.
522      */
523     @Deprecated
524     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
525         throws IOException {
526             this(reader, format, characterOffset, recordNumber, null, false);
527         }
528 
529     /**
530      * Constructs a new instance using the given {@link CSVFormat}
531      *
532      * <p>
533      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
534      * unless you close the {@code reader}.
535      * </p>
536      *
537      * @param reader
538      *            a Reader containing CSV-formatted input. Must not be null.
539      * @param format
540      *            the CSVFormat used for CSV parsing. Must not be null.
541      * @param characterOffset
542      *            Lexer offset when the parser does not start parsing at the beginning of the source.
543      * @param recordNumber
544      *            The next record number to assign.
545      * @param charset
546      *            The character encoding to be used for the reader when enableByteTracking is true.
547      * @param trackBytes
548      *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
549      * @throws IllegalArgumentException
550      *             If the parameters of the format are inconsistent or if either the reader or format is null.
551      * @throws IOException
552      *             If there is a problem reading the header or skipping the first record.
553      * @throws CSVException Thrown on invalid input.
554      */
555     private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
556         final Charset charset, final boolean trackBytes)
557         throws IOException {
558         Objects.requireNonNull(reader, "reader");
559         Objects.requireNonNull(format, "format");
560         this.format = format.copy();
561         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
562         this.csvRecordIterator = new CSVRecordIterator();
563         this.headers = createHeaders();
564         this.characterOffset = characterOffset;
565         this.recordNumber = recordNumber - 1;
566     }
567 
568     private void addRecordValue(final boolean lastRecord) {
569         final String input = format.trim(reusableToken.content.toString());
570         if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
571             return;
572         }
573         recordList.add(handleNull(input));
574     }
575 
576     /**
577      * Closes resources.
578      *
579      * @throws IOException
580      *             If an I/O error occurs
581      */
582     @Override
583     public void close() throws IOException {
584         lexer.close();
585     }
586 
587     private Map<String, Integer> createEmptyHeaderMap() {
588         return format.getIgnoreHeaderCase() ?
589                 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
590                 new LinkedHashMap<>();
591     }
592 
593     /**
594      * Creates the name to index mapping if the format defines a header.
595      *
596      * @return null if the format has no header.
597      * @throws IOException if there is a problem reading the header or skipping the first record
598      * @throws CSVException on invalid input.
599      */
600     private Headers createHeaders() throws IOException {
601         Map<String, Integer> hdrMap = null;
602         List<String> headerNames = null;
603         final String[] formatHeader = format.getHeader();
604         if (formatHeader != null) {
605             hdrMap = createEmptyHeaderMap();
606             String[] headerRecord = null;
607             if (formatHeader.length == 0) {
608                 // read the header from the first line of the file
609                 final CSVRecord nextRecord = nextRecord();
610                 if (nextRecord != null) {
611                     headerRecord = nextRecord.values();
612                     headerComment = nextRecord.getComment();
613                 }
614             } else {
615                 if (format.getSkipHeaderRecord()) {
616                     final CSVRecord nextRecord = nextRecord();
617                     if (nextRecord != null) {
618                         headerComment = nextRecord.getComment();
619                     }
620                 }
621                 headerRecord = formatHeader;
622             }
623 
624             // build the name to index mappings
625             if (headerRecord != null) {
626                 // Track an occurrence of a null, empty or blank header.
627                 boolean observedMissing = false;
628                 for (int i = 0; i < headerRecord.length; i++) {
629                     final String header = headerRecord[i];
630                     final boolean blankHeader = CSVFormat.isBlank(header);
631                     if (blankHeader && !format.getAllowMissingColumnNames()) {
632                         throw new IllegalArgumentException(
633                             "A header name is missing in " + Arrays.toString(headerRecord));
634                     }
635 
636                     final boolean containsHeader = blankHeader ? observedMissing : hdrMap.containsKey(header);
637                     final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
638                     final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
639                     final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
640 
641                     if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
642                         throw new IllegalArgumentException(
643                             String.format(
644                                 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
645                                 header, Arrays.toString(headerRecord)));
646                     }
647                     observedMissing |= blankHeader;
648                     if (header != null) {
649                         hdrMap.put(header, Integer.valueOf(i)); // N.B. Explicit (un)boxing is intentional
650                         if (headerNames == null) {
651                             headerNames = new ArrayList<>(headerRecord.length);
652                         }
653                         headerNames.add(header);
654                     }
655                 }
656             }
657         }
658         // Make header names Collection immutable
659         return new Headers(hdrMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
660     }
661 
662     /**
663      * Gets the current line number in the input stream.
664      *
665      * <p>
666      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
667      * the record number.
668      * </p>
669      *
670      * @return current line number
671      */
672     public long getCurrentLineNumber() {
673         return lexer.getCurrentLineNumber();
674     }
675 
676     /**
677      * Gets the first end-of-line string encountered.
678      *
679      * @return the first end-of-line string
680      * @since 1.5
681      */
682     public String getFirstEndOfLine() {
683         return lexer.getFirstEol();
684     }
685 
686     /**
687      * Gets the header comment, if any.
688      * The header comment appears before the header record.
689      *
690      * @return the header comment for this stream, or null if no comment is available.
691      * @since 1.10.0
692      */
693     public String getHeaderComment() {
694         return headerComment;
695     }
696 
697     /**
698      * Gets a copy of the header map as defined in the CSVFormat's header.
699      * <p>
700      * The map keys are column names. The map values are 0-based indices.
701      * </p>
702      * <p>
703      * Note: The map can only provide a one-to-one mapping when the format did not
704      * contain null or duplicate column names.
705      * </p>
706      *
707      * @return a copy of the header map.
708      */
709     public Map<String, Integer> getHeaderMap() {
710         if (headers.headerMap == null) {
711             return null;
712         }
713         final Map<String, Integer> map = createEmptyHeaderMap();
714         map.putAll(headers.headerMap);
715         return map;
716     }
717 
718     /**
719      * Gets the underlying header map.
720      *
721      * @return the underlying header map.
722      */
723     Map<String, Integer> getHeaderMapRaw() {
724         return headers.headerMap;
725     }
726 
727     /**
728      * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
729      * <p>
730      * Note: The list provides strings that can be used as keys in the header map.
731      * The list will not contain null column names if they were present in the input
732      * format.
733      * </p>
734      *
735      * @return read-only list of header names that iterates in column order.
736      * @see #getHeaderMap()
737      * @since 1.7
738      */
739     public List<String> getHeaderNames() {
740         return Collections.unmodifiableList(headers.headerNames);
741     }
742 
743     /**
744      * Gets the current record number in the input stream.
745      *
746      * <p>
747      * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
748      * the line number.
749      * </p>
750      *
751      * @return current record number
752      */
753     public long getRecordNumber() {
754         return recordNumber;
755     }
756 
757     /**
758      * Parses the CSV input according to the given format and returns the content as a list of
759      * {@link CSVRecord CSVRecords}.
760      *
761      * <p>
762      * The returned content starts at the current parse-position in the stream.
763      * </p>
764      *
765      * @return list of {@link CSVRecord CSVRecords}, may be empty
766      * @throws UncheckedIOException
767      *             on parse error or input read-failure
768      */
769     public List<CSVRecord> getRecords() {
770         return stream().collect(Collectors.toList());
771     }
772 
773     /**
774      * Gets the trailer comment, if any.
775      * Trailer comments are located between the last record and EOF
776      *
777      * @return the trailer comment for this stream, or null if no comment is available.
778      * @since 1.10.0
779      */
780     public String getTrailerComment() {
781         return trailerComment;
782     }
783 
784     /**
785      * Maps a cell's text to {@code null} when the format says the cell represents a null value.
786      *
787      * @param input
788      *           the cell text to examine, must not be {@code null}
789      * @return {@code null} if the input is parsed as null, or the input itself otherwise
790      */
791     private String handleNull(final String input) {
792         final boolean quoted = reusableToken.isQuoted;
793         final String nullString = format.getNullString();
794         final boolean strict = isStrictQuoteMode();
795         if (!input.equals(nullString)) {
796             // no match with the null string: without one, an unquoted empty cell (,,) is null in strict quote mode, "" is not
797             return strict && nullString == null && input.isEmpty() && !quoted ? null : input;
798         }
799         // matches the null string: in strict quote mode (ALL_NON_NULL or NON_NUMERIC) a quoted "NULL" stays text
800         return strict && quoted ? input : null;
801     }
802 
803     /**
804      * Tests whether a header comment is available.
805      * A header comment is a comment that appears before the header record.
806      * Note that if the parser's format has been given an explicit header
807      * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
808      * and the header record is not being skipped
809      * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
810      * will be associated with the first record, not the header.
811      *
812      * @return {@code true} if this parser has seen a header comment, {@code false} otherwise
813      * @since 1.10.0
814      */
815     public boolean hasHeaderComment() {
816         return Objects.nonNull(headerComment);
817     }
818 
819     /**
820      * Tests whether a trailer comment is available.
821      * A trailer comment is a comment located between the last record and EOF.
822      * It only becomes available after the parser has
823      * finished processing this stream.
824      *
825      * @return {@code true} if this parser has seen a trailer comment, {@code false} otherwise
826      * @since 1.10.0
827      */
828     public boolean hasTrailerComment() {
829         return Objects.nonNull(trailerComment);
830     }
831 
832     /**
833      * Tests whether this parser is closed, as reported by the underlying lexer.
834      *
835      * @return {@code true} if the lexer reports that it is closed, {@code false} otherwise.
836      */
837     public boolean isClosed() {
838         return this.lexer.isClosed();
839     }
840 
841     /**
842      * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
843      *
844      * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
845      *         {@link QuoteMode#NON_NUMERIC}.
846      */
847     private boolean isStrictQuoteMode() {
848         return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
849                format.getQuoteMode() == QuoteMode.NON_NUMERIC;
850     }
851 
852     /**
853      * Returns the record iterator.
854      *
855      * <p>
856      * An {@link IOException} caught during the iteration is re-thrown as an unchecked exception.
857      * NOTE(review): presumably an {@link UncheckedIOException}, as documented by {@link #getRecords()},
858      * rather than the {@link IllegalStateException} previously stated here; confirm against the iterator.
859      * </p>
860      * <p>
861      * If the parser is closed, the iterator will not yield any more records: {@link Iterator#hasNext()} returns
862      * {@code false} and {@link Iterator#next()} throws a {@link NoSuchElementException}.
863      * </p>
864      * <p>
865      * To obtain an iterator that remains usable after the parser is closed, one option is to extract all
866      * records as a list with {@link #getRecords()}, and return an iterator to that list.
867      * </p>
868      *
869      * @return the iterator over the records of this parser
870      */
871     @Override
872     public Iterator<CSVRecord> iterator() {
873         return this.csvRecordIterator;
874     }
875 
876     /**
877      * Parses the next record from the current point in the stream, gathering any comment tokens read along the way.
878      *
879      * @return the next record, or {@code null} if no values were collected (end of the stream reached)
880      * @throws IOException  on parse error or input read-failure
881      * @throws CSVException on an INVALID token or an unexpected token type.
882      */
883     CSVRecord nextRecord() throws IOException {
884         CSVRecord result = null;
885         recordList.clear(); // start from an empty value list
886         StringBuilder sb = null; // comment text gathered during this call; stays null when no comment is seen
887         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
888         final long startBytePosition = lexer.getBytesRead() + this.characterOffset; // NOTE(review): adds a character offset to a byte count - confirm intended
889         do {
890             reusableToken.reset();
891             lexer.nextToken(reusableToken);
892             switch (reusableToken.type) {
893             case TOKEN:
894                 addRecordValue(false);
895                 break;
896             case EORECORD:
897                 addRecordValue(true);
898                 break;
899             case EOF:
900                 if (reusableToken.isReady) { // presumably the token still carries a value at EOF - verify against the lexer
901                     addRecordValue(true);
902                 } else if (sb != null) { // comments gathered in this call become the trailer comment
903                     trailerComment = sb.toString();
904                 }
905                 break;
906             case INVALID:
907                 throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
908             case COMMENT: // Accumulated into sb, one comment per line
909                 if (sb == null) { // first comment for this record
910                     sb = new StringBuilder();
911                 } else {
912                     sb.append(Constants.LF);
913                 }
914                 sb.append(reusableToken.content);
915                 reusableToken.type = TOKEN; // Read another token: the loop condition below tests this field
916                 break;
917             default:
918                 throw new CSVException("Unexpected Token type: %s", reusableToken.type);
919             }
920         } while (reusableToken.type == TOKEN); // any other type ends the loop
921
922         if (!recordList.isEmpty()) {
923             recordNumber++;
924             final String comment = Objects.toString(sb, null); // null when no comment was gathered
925             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
926                 recordNumber, startCharPosition, startBytePosition);
927         }
928         return result;
929     }
930 
931     /**
932      * Returns a sequential, ordered {@code Stream} over the records supplied by {@link #iterator()}.
933      * <p>
934      * If the parser is closed, the stream will not produce any more values; see the comments in {@link #iterator()}.
935      * </p>
936      * @return a sequential {@code Stream} with this parser's record iterator as its source.
937      * @since 1.9.0
938      */
939     public Stream<CSVRecord> stream() {
940         final Spliterator<CSVRecord> records = Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED);
941         return StreamSupport.stream(records, false);
942     }
943 
944 }