1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.csv.Token.Type.TOKEN;
23
24 import java.io.Closeable;
25 import java.io.File;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.InputStreamReader;
29 import java.io.Reader;
30 import java.io.StringReader;
31 import java.io.UncheckedIOException;
32 import java.net.URL;
33 import java.nio.charset.Charset;
34 import java.nio.file.Files;
35 import java.nio.file.Path;
36 import java.util.ArrayList;
37 import java.util.Arrays;
38 import java.util.Collections;
39 import java.util.Iterator;
40 import java.util.LinkedHashMap;
41 import java.util.List;
42 import java.util.Map;
43 import java.util.NoSuchElementException;
44 import java.util.Objects;
45 import java.util.Spliterator;
46 import java.util.Spliterators;
47 import java.util.TreeMap;
48 import java.util.stream.Collectors;
49 import java.util.stream.Stream;
50 import java.util.stream.StreamSupport;
51
52 import org.apache.commons.io.Charsets;
53 import org.apache.commons.io.build.AbstractStreamBuilder;
54 import org.apache.commons.io.function.Uncheck;
55
56 /**
57 * Parses CSV files according to the specified format.
58 *
59 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
60 * specification of a {@link CSVFormat}.
61 *
62 * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
63 *
64 * <h2>Creating instances</h2>
65 * <p>
66 * There are several static factory methods that can be used to create instances for various types of resources:
67 * </p>
68 * <ul>
69 * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
70 * <li>{@link #parse(String, CSVFormat)}</li>
71 * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
72 * </ul>
73 * <p>
 * Alternatively, parsers can be created from a {@link Reader} with {@link #parse(Reader, CSVFormat)} or the {@link #builder() builder}.
75 *
76 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
77 * </p>
78 * <pre>
79 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
80 * ...
81 * }
82 * </pre>
83 *
84 * <h2>Parsing record wise</h2>
85 * <p>
86 * To parse a CSV input from a file, you write:
87 * </p>
88 *
89 * <pre>{@code
90 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
92 * for (CSVRecord csvRecord : parser) {
93 * ...
94 * }}
95 * </pre>
96 *
97 * <p>
 * This will read and parse the contents of the file using the
99 * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
100 * </p>
101 *
102 * <p>
103 * To parse CSV input in a format like Excel, you write:
104 * </p>
105 *
106 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
108 * for (CSVRecord csvRecord : parser) {
109 * ...
110 * }
111 * </pre>
112 *
113 * <p>
114 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
115 * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
116 * </p>
117 *
118 * <h2>Parsing into memory</h2>
119 * <p>
120 * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
121 * </p>
122 *
123 * <pre>{@code
124 * Reader in = new StringReader("a;b\nc;d");
125 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
126 * List<CSVRecord> list = parser.getRecords();
127 * }</pre>
128 *
129 * <p>
130 * There are two constraints that have to be kept in mind:
131 * </p>
132 *
133 * <ol>
134 * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
135 * the input, those records will not end up in the in-memory representation of your CSV data.</li>
136 * <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
137 * parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
138 * </ol>
139 *
140 * <h2>Notes</h2>
141 * <p>
142 * The internal parser state is completely covered by the format and the reader state.
143 * </p>
144 *
145 * @see <a href="package-summary.html">package documentation for more details</a>
146 */
147 public final class CSVParser implements Iterable<CSVRecord>, Closeable {
148
149 /**
150 * Builds a new {@link CSVParser}.
151 *
152 * @since 1.13.0
153 */
154 public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
155
156 private CSVFormat format;
157 private long characterOffset;
158 private long recordNumber = 1;
159 private boolean trackBytes;
160
161 /**
162 * Constructs a new instance.
163 */
164 protected Builder() {
165 // empty
166 }
167
168 @SuppressWarnings("resource")
169 @Override
170 public CSVParser get() throws IOException {
171 return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
172 }
173
174 /**
175 * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
176 *
177 * @param characterOffset the lexer offset.
178 * @return {@code this} instance.
179 */
180 public Builder setCharacterOffset(final long characterOffset) {
181 this.characterOffset = characterOffset;
182 return asThis();
183 }
184
185 /**
186 * Sets the CSV format. A copy of the given format is kept.
187 *
188 * @param format the CSV format, {@code null} resets to {@link CSVFormat#DEFAULT}.
189 * @return {@code this} instance.
190 */
191 public Builder setFormat(final CSVFormat format) {
192 this.format = CSVFormat.copy(format);
193 return asThis();
194 }
195
196 /**
197 * Sets the next record number to assign, defaults to {@code 1}.
198 *
199 * @param recordNumber the next record number to assign.
200 * @return {@code this} instance.
201 */
202 public Builder setRecordNumber(final long recordNumber) {
203 this.recordNumber = recordNumber;
204 return asThis();
205 }
206
207 /**
208 * Sets whether to enable byte tracking for the parser.
209 *
210 * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
211 * @return {@code this} instance.
212 * @since 1.13.0
213 */
214 public Builder setTrackBytes(final boolean trackBytes) {
215 this.trackBytes = trackBytes;
216 return asThis();
217 }
218
219 }
220
221 final class CSVRecordIterator implements Iterator<CSVRecord> {
222 private CSVRecord current;
223
224 /**
225 * Gets the next record or null at the end of stream or max rows read.
226 *
227 * @throws IOException on parse error or input read-failure
228 * @throws CSVException on invalid input.
229 * @return the next record, or {@code null} if the end of the stream has been reached.
230 */
231 private CSVRecord getNextRecord() {
232 CSVRecord record = null;
233 if (format.useRow(recordNumber + 1)) {
234 record = Uncheck.get(CSVParser.this::nextRecord);
235 }
236 return record;
237 }
238
239 @Override
240 public boolean hasNext() {
241 if (isClosed()) {
242 return false;
243 }
244 if (current == null) {
245 current = getNextRecord();
246 }
247 return current != null;
248 }
249
250 @Override
251 public CSVRecord next() {
252 if (isClosed()) {
253 throw new NoSuchElementException("CSVParser has been closed");
254 }
255 CSVRecord next = current;
256 current = null;
257 if (next == null) {
258 // hasNext() wasn't called before
259 next = getNextRecord();
260 if (next == null) {
261 throw new NoSuchElementException("No more CSV records available");
262 }
263 }
264 return next;
265 }
266
267 @Override
268 public void remove() {
269 throw new UnsupportedOperationException();
270 }
271 }
/**
 * Header information based on name and position.
 *
 * <p>
 * Plain holder for the two header views; it stores the given references as-is without copying them.
 * </p>
 */
private static final class Headers {

    /**
     * Header column positions (0-based), keyed by column name.
     */
    final Map<String, Integer> headerMap;

    /**
     * Header names in column order.
     */
    final List<String> headerNames;

    /**
     * Constructs a new instance holding the given map and list.
     *
     * @param headerMap column name to 0-based index mapping.
     * @param headerNames header names in column order.
     */
    Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
        this.headerMap = headerMap;
        this.headerNames = headerNames;
    }
}
292
/**
 * Creates a new builder.
 *
 * <p>
 * Each call returns a fresh {@link Builder} instance.
 * </p>
 *
 * @return a new builder.
 * @since 1.13.0
 */
public static Builder builder() {
    return new Builder();
}
302
303 /**
304 * Creates a parser for the given {@link File}.
305 *
306 * @param file
307 * a CSV file. Must not be null.
308 * @param charset
309 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
310 * @param format
311 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
312 * @return a new parser
313 * @throws IllegalArgumentException
314 * If the parameters of the format are inconsistent.
315 * @throws IOException
316 * If an I/O error occurs
317 * @throws CSVException Thrown on invalid CSV input data.
318 * @throws NullPointerException if {@code file} is {@code null}.
319 */
320 public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
321 Objects.requireNonNull(file, "file");
322 return parse(file.toPath(), charset, format);
323 }
324
325 /**
326 * Creates a CSV parser using the given {@link CSVFormat}.
327 *
328 * <p>
329 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
330 * unless you close the {@code reader}.
331 * </p>
332 *
333 * @param inputStream
334 * an InputStream containing CSV-formatted input, {@code null} maps to {@link CSVFormat#DEFAULT}.
335 * @param charset
336 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
337 * @param format
338 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
339 * @return a new CSVParser configured with the given reader and format.
340 * @throws IllegalArgumentException
341 * If the parameters of the format are inconsistent or if either reader or format are null.
342 * @throws IOException
343 * If there is a problem reading the header or skipping the first record
344 * @throws CSVException Thrown on invalid CSV input data.
345 * @since 1.5
346 */
347 public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
348 throws IOException {
349 return parse(new InputStreamReader(inputStream, Charsets.toCharset(charset)), format);
350 }
351
352 /**
353 * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
354 *
355 * @param path
356 * a CSV file. Must not be null.
357 * @param charset
358 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
359 * @param format
360 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
361 * @return a new parser
362 * @throws IllegalArgumentException
363 * If the parameters of the format are inconsistent.
364 * @throws IOException
365 * If an I/O error occurs
366 * @throws CSVException Thrown on invalid CSV input data.
367 * @throws NullPointerException if {@code path} is {@code null}.
368 * @since 1.5
369 */
370 @SuppressWarnings("resource")
371 public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
372 Objects.requireNonNull(path, "path");
373 return parse(Files.newInputStream(path), charset, format);
374 }
375
/**
 * Creates a CSV parser using the given {@link CSVFormat}.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
 * @return a new CSVParser configured with the given reader and format.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 * @throws CSVException Thrown on invalid CSV input data.
 * @since 1.5
 */
public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
    // Delegates to the builder, which substitutes CSVFormat.DEFAULT for a null format.
    return builder().setReader(reader).setFormat(format).get();
}
399
400 /**
401 * Creates a parser for the given {@link String}.
402 *
403 * @param string
404 * a CSV string. Must not be null.
405 * @param format
406 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
407 * @return a new parser
408 * @throws IllegalArgumentException
409 * If the parameters of the format are inconsistent.
410 * @throws IOException
411 * If an I/O error occurs
412 * @throws CSVException Thrown on invalid CSV input data.
413 * @throws NullPointerException if {@code string} is {@code null}.
414 */
415 public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
416 Objects.requireNonNull(string, "string");
417 return parse(new StringReader(string), format);
418 }
419
420 /**
421 * Creates and returns a parser for the given URL, which the caller MUST close.
422 *
423 * <p>
424 * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
425 * you close the {@code url}.
426 * </p>
427 *
428 * @param url
429 * a URL. Must not be null.
430 * @param charset
431 * the charset for the resource, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
432 * @param format
433 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
434 * @return a new parser
435 * @throws IllegalArgumentException
436 * If the parameters of the format are inconsistent.
437 * @throws IOException
438 * If an I/O error occurs
439 * @throws CSVException Thrown on invalid CSV input data.
440 * @throws NullPointerException if {@code url} is {@code null}.
441 */
442 @SuppressWarnings("resource")
443 public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
444 Objects.requireNonNull(url, "url");
445 return parse(url.openStream(), charset, format);
446 }
447
/** Comment attached to the record consumed while creating the headers, or {@code null} if there was none. */
private String headerComment;

/** Comment text found at the end of the input after the last record; set by {@link #nextRecord()} on EOF, otherwise {@code null}. */
private String trailerComment;

/** Private copy of the format this parser was created with. */
private final CSVFormat format;

/** Header name/position information built once in the constructor. */
private final Headers headers;

/** Token source; also owns the underlying reader. */
private final Lexer lexer;

/** The single iterator instance handed out by {@link #iterator()}. */
private final CSVRecordIterator csvRecordIterator;

/** A record buffer for {@link #nextRecord()}. Grows as necessary and is reused. */
private final List<String> recordList = new ArrayList<>();

/**
 * The number of the most recently parsed record. {@link #nextRecord()} increments this before assigning it to
 * the record it returns; the constructor initializes it to one less than the first record number requested.
 */
private long recordNumber;

/**
 * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
 * with {@link #recordNumber}.
 */
private final long characterOffset;

/** Token instance reset and refilled for every lexer call, to avoid per-token allocation. */
private final Token reusableToken = new Token();
475
/**
 * Constructs a new instance using the given {@link CSVFormat}, starting at character offset {@code 0} and
 * record number {@code 1}.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws NullPointerException
 *             If either reader or format is null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 * @throws CSVException Thrown on invalid CSV input data.
 * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
 */
@Deprecated
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
    this(reader, format, 0, 1);
}
499
/**
 * Constructs a new instance using the given {@link CSVFormat}, without byte tracking and without an explicit
 * charset.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @param characterOffset
 *            Lexer offset when the parser does not start parsing at the beginning of the source.
 * @param recordNumber
 *            The next record number to assign.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws NullPointerException
 *             If either the reader or format is null.
 * @throws IOException
 *             if there is a problem reading the header or skipping the first record
 * @throws CSVException on invalid input.
 * @since 1.1
 * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
 */
@Deprecated
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException {
    this(reader, format, characterOffset, recordNumber, null, false);
}
528
/**
 * Constructs a new instance using the given {@link CSVFormat}.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @param characterOffset
 *            Lexer offset when the parser does not start parsing at the beginning of the source.
 * @param recordNumber
 *            The next record number to assign.
 * @param charset
 *            The character encoding to be used for the reader when {@code trackBytes} is true.
 * @param trackBytes
 *            {@code true} to enable byte tracking for the parser; {@code false} to disable it.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws NullPointerException
 *             If either the reader or format is null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record.
 * @throws CSVException Thrown on invalid CSV input data.
 */
@SuppressWarnings("resource") // reader is managed by lexer.
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset,
        final boolean trackBytes) throws IOException {
    Objects.requireNonNull(reader, "reader");
    Objects.requireNonNull(format, "format");
    this.format = format.copy();
    this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
    this.csvRecordIterator = new CSVRecordIterator();
    // May consume the first record of the input (header row or skipped header record) via nextRecord().
    this.headers = createHeaders();
    // Both fields are assigned only after the headers were created: recordNumber so that a record consumed for
    // the header does not use up a record number, and characterOffset is therefore still 0 while that record is read.
    // NOTE(review): confirm that the header record ignoring characterOffset is intended.
    this.characterOffset = characterOffset;
    // nextRecord() increments before assigning, so store one less than the first number to hand out.
    this.recordNumber = recordNumber - 1;
}
567
568 private void addRecordValue(final boolean lastRecord) {
569 final String input = format.trim(reusableToken.content.toString());
570 if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
571 return;
572 }
573 recordList.add(handleNull(input));
574 }
575
/**
 * Closes resources by closing the lexer, which manages the underlying reader.
 *
 * @throws IOException
 *             If an I/O error occurs
 */
@Override
public void close() throws IOException {
    lexer.close();
}
586
587 private Map<String, Integer> createEmptyHeaderMap() {
588 return format.getIgnoreHeaderCase() ?
589 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
590 new LinkedHashMap<>();
591 }
592
593 /**
594 * Creates the name to index mapping if the format defines a header.
595 *
596 * @return null if the format has no header.
597 * @throws IOException if there is a problem reading the header or skipping the first record
598 * @throws CSVException on invalid input.
599 */
600 private Headers createHeaders() throws IOException {
601 Map<String, Integer> headerMap = null;
602 List<String> headerNames = null;
603 final String[] formatHeader = format.getHeader();
604 if (formatHeader != null) {
605 headerMap = createEmptyHeaderMap();
606 String[] headerRecord = null;
607 if (formatHeader.length == 0) {
608 // read the header from the first line of the file
609 final CSVRecord nextRecord = nextRecord();
610 if (nextRecord != null) {
611 headerRecord = nextRecord.values();
612 headerComment = nextRecord.getComment();
613 }
614 } else {
615 if (format.getSkipHeaderRecord()) {
616 final CSVRecord nextRecord = nextRecord();
617 if (nextRecord != null) {
618 headerComment = nextRecord.getComment();
619 }
620 }
621 headerRecord = formatHeader;
622 }
623 // build the name to index mappings
624 if (headerRecord != null) {
625 // Track an occurrence of a null, empty or blank header.
626 boolean observedMissing = false;
627 for (int i = 0; i < headerRecord.length; i++) {
628 final String header = headerRecord[i];
629 final boolean blankHeader = CSVFormat.isBlank(header);
630 if (blankHeader && !format.getAllowMissingColumnNames()) {
631 throw new IllegalArgumentException("A header name is missing in " + Arrays.toString(headerRecord));
632 }
633 final boolean containsHeader = blankHeader ? observedMissing : headerMap.containsKey(header);
634 final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
635 final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
636 final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
637 if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
638 throw new IllegalArgumentException(String.format(
639 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
640 header, Arrays.toString(headerRecord)));
641 }
642 observedMissing |= blankHeader;
643 if (header != null) {
644 headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional
645 if (headerNames == null) {
646 headerNames = new ArrayList<>(headerRecord.length);
647 }
648 headerNames.add(header);
649 }
650 }
651 }
652 }
653 // Make header names Collection immutable
654 return new Headers(headerMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
655 }
656
/**
 * Gets the current line number in the input stream, as reported by the lexer.
 *
 * <p>
 * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
 * the record number.
 * </p>
 *
 * @return current line number.
 */
public long getCurrentLineNumber() {
    return lexer.getCurrentLineNumber();
}
670
/**
 * Gets the first end-of-line string encountered, as reported by the lexer.
 *
 * @return the first end-of-line string.
 * @since 1.5
 */
public String getFirstEndOfLine() {
    return lexer.getFirstEol();
}
680
/**
 * Gets the header comment, if any.
 * The header comment appears before the header record.
 *
 * @return the header comment for this stream, or null if no comment is available.
 * @see #hasHeaderComment()
 * @since 1.10.0
 */
public String getHeaderComment() {
    return headerComment;
}
691
692 /**
693 * Gets a copy of the header map as defined in the CSVFormat's header.
694 * <p>
695 * The map keys are column names. The map values are 0-based indices.
696 * </p>
697 * <p>
698 * <strong>Note:</strong> The map can only provide a one-to-one mapping when the format did not
699 * contain null or duplicate column names.
700 * </p>
701 *
702 * @return a copy of the header map.
703 */
704 public Map<String, Integer> getHeaderMap() {
705 if (headers.headerMap == null) {
706 return null;
707 }
708 final Map<String, Integer> map = createEmptyHeaderMap();
709 map.putAll(headers.headerMap);
710 return map;
711 }
712
/**
 * Gets the underlying header map without copying it.
 *
 * @return the underlying header map, or {@code null} if the format defines no header.
 */
Map<String, Integer> getHeaderMapRaw() {
    return headers.headerMap;
}
721
/**
 * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
 * <p>
 * Note: The list provides strings that can be used as keys in the header map.
 * The list will not contain null column names if they were present in the input
 * format.
 * </p>
 *
 * @return read-only list of header names that iterates in column order; empty if there are no header names.
 * @see #getHeaderMap()
 * @since 1.7
 */
public List<String> getHeaderNames() {
    return Collections.unmodifiableList(headers.headerNames);
}
737
/**
 * Gets the current record number in the input stream, that is, the number assigned to the most recently parsed
 * record.
 *
 * <p>
 * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
 * the line number.
 * </p>
 *
 * @return current record number
 */
public long getRecordNumber() {
    return recordNumber;
}
751
752 /**
753 * Parses the CSV input according to the given format and returns the content as a list of
754 * {@link CSVRecord CSVRecords}.
755 *
756 * <p>
757 * The returned content starts at the current parse-position in the stream.
758 * </p>
759 * <p>
760 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows this method produces.
761 * </p>
762 *
763 * @return list of {@link CSVRecord CSVRecords}, may be empty
764 * @throws UncheckedIOException
765 * on parse error or input read-failure
766 */
767 public List<CSVRecord> getRecords() {
768 return stream().collect(Collectors.toList());
769 }
770
/**
 * Gets the trailer comment, if any.
 * Trailer comments are located between the last record and EOF.
 *
 * @return the trailer comment for this stream, or null if no comment is available.
 * @see #hasTrailerComment()
 * @since 1.10.0
 */
public String getTrailerComment() {
    return trailerComment;
}
781
782 /**
783 * Handles whether the input is parsed as null
784 *
785 * @param input
786 * the cell data to further processed
787 * @return null if input is parsed as null, or input itself if the input isn't parsed as null
788 */
789 private String handleNull(final String input) {
790 final boolean isQuoted = reusableToken.isQuoted;
791 final String nullString = format.getNullString();
792 final boolean strictQuoteMode = isStrictQuoteMode();
793 if (input.equals(nullString)) {
794 // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
795 return strictQuoteMode && isQuoted ? input : null;
796 }
797 // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
798 return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
799 }
800
/**
 * Checks whether there is a header comment.
 * The header comment appears before the header record.
 * Note that if the parser's format has been given an explicit header
 * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
 * and the header record is not being skipped
 * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
 * will be associated with the first record, not the header.
 *
 * @return true if this parser has seen a header comment, false otherwise
 * @see #getHeaderComment()
 * @since 1.10.0
 */
public boolean hasHeaderComment() {
    return headerComment != null;
}
816
/**
 * Checks whether there is a trailer comment.
 * Trailer comments are located between the last record and EOF.
 * The trailer comments will only be available after the parser has
 * finished processing this stream.
 *
 * @return true if this parser has seen a trailer comment, false otherwise
 * @see #getTrailerComment()
 * @since 1.10.0
 */
public boolean hasTrailerComment() {
    return trailerComment != null;
}
829
/**
 * Tests whether this parser is closed, as reported by the lexer.
 *
 * @return whether this parser is closed.
 */
public boolean isClosed() {
    return lexer.isClosed();
}
838
839 /**
840 * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
841 *
842 * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
843 * {@link QuoteMode#NON_NUMERIC}.
844 */
845 private boolean isStrictQuoteMode() {
846 return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
847 format.getQuoteMode() == QuoteMode.NON_NUMERIC;
848 }
849
/**
 * Returns the record iterator.
 *
 * <p>
 * Every call returns the same iterator instance, so all iterations over this parser share one position in the
 * input.
 * </p>
 * <p>
 * An {@link IOException} caught during the iteration is re-thrown as an
 * {@link UncheckedIOException}.
 * </p>
 * <p>
 * If the parser is closed, the iterator will not yield any more records.
 * A call to {@link Iterator#hasNext()} will return {@code false} and
 * a call to {@link Iterator#next()} will throw a
 * {@link NoSuchElementException}.
 * </p>
 * <p>
 * If it is necessary to construct an iterator which is usable after the
 * parser is closed, one option is to extract all records as a list with
 * {@link #getRecords()}, and return an iterator to that list.
 * </p>
 * <p>
 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows an Iterator produces.
 * </p>
 *
 * @return the record iterator of this parser.
 */
@Override
public Iterator<CSVRecord> iterator() {
    return csvRecordIterator;
}
876
/**
 * Parses the next record from the current point in the stream.
 *
 * <p>
 * Tokens are pulled from the lexer until a record ends or the input is exhausted. Comment tokens seen on the way
 * are collected (joined with line feeds) and attached to the record that follows them; comments that are
 * followed only by the end of the input become the trailer comment.
 * </p>
 *
 * @return the parsed record, or {@code null} if the end of the stream has been reached.
 * @throws IOException on parse error or input read-failure.
 * @throws CSVException on invalid CSV input data.
 */
CSVRecord nextRecord() throws IOException {
    CSVRecord result = null;
    // The value buffer is shared between calls; start each record empty.
    recordList.clear();
    // Accumulates comment lines preceding this record; stays null when there are none.
    StringBuilder sb = null;
    final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
    // NOTE(review): characterOffset is documented as a character offset but is added to a byte count here - confirm.
    final long startBytePosition = lexer.getBytesRead() + characterOffset;
    do {
        reusableToken.reset();
        lexer.nextToken(reusableToken);
        switch (reusableToken.type) {
        case TOKEN:
            // A value followed by more values of the same record.
            addRecordValue(false);
            break;
        case EORECORD:
            // The last value of the record.
            addRecordValue(true);
            break;
        case EOF:
            if (reusableToken.isReady) {
                // The input ended while a value was pending: it closes the final record.
                addRecordValue(true);
            } else if (sb != null) {
                // Only comments were left before the end of the input.
                trailerComment = sb.toString();
            }
            break;
        case INVALID:
            throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
        case COMMENT: // Collected so it can be attached to the next record or become the trailer comment
            if (sb == null) { // first comment for this record
                sb = new StringBuilder();
            } else {
                sb.append(Constants.LF);
            }
            sb.append(reusableToken.content);
            reusableToken.type = TOKEN; // Read another token
            break;
        default:
            throw new CSVException("Unexpected Token type: %s", reusableToken.type);
        }
    } while (reusableToken.type == TOKEN);
    // No values means no record (for example at the end of the input); recordNumber is left untouched then.
    if (!recordList.isEmpty()) {
        recordNumber++;
        result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), Objects.toString(sb, null), recordNumber, startCharPosition,
                startBytePosition);
    }
    return result;
}
929
930 /**
931 * Returns a sequential {@code Stream} with this collection as its source.
932 * <p>
933 * If the parser is closed, the stream will not produce any more values.
934 * See the comments in {@link #iterator()}.
935 * </p>
936 * <p>
937 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows a Stream produces.
938 * </p>
939 *
940 * @return a sequential {@code Stream} with this collection as its source.
941 * @since 1.9.0
942 */
943 public Stream<CSVRecord> stream() {
944 return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
945 }
946
947 }