001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 020package org.apache.commons.csv; 021 022import static org.apache.commons.csv.Token.Type.TOKEN; 023 024import java.io.Closeable; 025import java.io.File; 026import java.io.IOException; 027import java.io.InputStream; 028import java.io.InputStreamReader; 029import java.io.Reader; 030import java.io.StringReader; 031import java.io.UncheckedIOException; 032import java.net.URL; 033import java.nio.charset.Charset; 034import java.nio.file.Files; 035import java.nio.file.Path; 036import java.util.ArrayList; 037import java.util.Arrays; 038import java.util.Collections; 039import java.util.Iterator; 040import java.util.LinkedHashMap; 041import java.util.List; 042import java.util.Map; 043import java.util.NoSuchElementException; 044import java.util.Objects; 045import java.util.Spliterator; 046import java.util.Spliterators; 047import java.util.TreeMap; 048import java.util.stream.Collectors; 049import java.util.stream.Stream; 050import java.util.stream.StreamSupport; 051 052import org.apache.commons.io.Charsets; 053import org.apache.commons.io.build.AbstractStreamBuilder; 054import org.apache.commons.io.function.Uncheck; 055 056/** 057 * Parses CSV files according to the specified format. 058 * 059 * Because CSV appears in many different dialects, the parser supports many formats by allowing the 060 * specification of a {@link CSVFormat}. 061 * 062 * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream. 063 * 064 * <h2>Creating instances</h2> 065 * <p> 066 * There are several static factory methods that can be used to create instances for various types of resources: 067 * </p> 068 * <ul> 069 * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li> 070 * <li>{@link #parse(String, CSVFormat)}</li> 071 * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li> 072 * </ul> 073 * <p> 074 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. 075 * 076 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: 077 * </p> 078 * <pre> 079 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) { 080 * ... 081 * } 082 * </pre> 083 * 084 * <h2>Parsing record wise</h2> 085 * <p> 086 * To parse a CSV input from a file, you write: 087 * </p> 088 * 089 * <pre>{@code 090 * File csvData = new File("/path/to/csv"); 091 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180); 092 * for (CSVRecord csvRecord : parser) { 093 * ... 094 * }} 095 * </pre> 096 * 097 * <p> 098 * This will read the parse the contents of the file using the 099 * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format. 100 * </p> 101 * 102 * <p> 103 * To parse CSV input in a format like Excel, you write: 104 * </p> 105 * 106 * <pre> 107 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL); 108 * for (CSVRecord csvRecord : parser) { 109 * ... 110 * } 111 * </pre> 112 * 113 * <p> 114 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about 115 * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. 116 * </p> 117 * 118 * <h2>Parsing into memory</h2> 119 * <p> 120 * If parsing record-wise is not desired, the contents of the input can be read completely into memory. 121 * </p> 122 * 123 * <pre>{@code 124 * Reader in = new StringReader("a;b\nc;d"); 125 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); 126 * List<CSVRecord> list = parser.getRecords(); 127 * }</pre> 128 * 129 * <p> 130 * There are two constraints that have to be kept in mind: 131 * </p> 132 * 133 * <ol> 134 * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from 135 * the input, those records will not end up in the in-memory representation of your CSV data.</li> 136 * <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're 137 * parsing a 150MB file of CSV data the contents will be read completely into memory.</li> 138 * </ol> 139 * 140 * <h2>Notes</h2> 141 * <p> 142 * The internal parser state is completely covered by the format and the reader state. 143 * </p> 144 * 145 * @see <a href="package-summary.html">package documentation for more details</a> 146 */ 147public final class CSVParser implements Iterable<CSVRecord>, Closeable { 148 149 /** 150 * Builds a new {@link CSVParser}. 151 * 152 * @since 1.13.0 153 */ 154 public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> { 155 156 private CSVFormat format; 157 private long characterOffset; 158 private long recordNumber = 1; 159 private boolean trackBytes; 160 161 /** 162 * Constructs a new instance. 163 */ 164 protected Builder() { 165 // empty 166 } 167 168 @SuppressWarnings("resource") 169 @Override 170 public CSVParser get() throws IOException { 171 return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes); 172 } 173 174 /** 175 * Sets the lexer offset when the parser does not start parsing at the beginning of the source. 176 * 177 * @param characterOffset the lexer offset. 178 * @return this instance. 179 */ 180 public Builder setCharacterOffset(final long characterOffset) { 181 this.characterOffset = characterOffset; 182 return asThis(); 183 } 184 185 /** 186 * Sets the CSV format. A copy of the given format is kept. 187 * 188 * @param format the CSV format, {@code null} resets to {@link CSVFormat#DEFAULT}. 189 * @return this instance. 190 */ 191 public Builder setFormat(final CSVFormat format) { 192 this.format = CSVFormat.copy(format); 193 return asThis(); 194 } 195 196 /** 197 * Sets the next record number to assign, defaults to {@code 1}. 198 * 199 * @param recordNumber the next record number to assign. 200 * @return this instance. 201 */ 202 public Builder setRecordNumber(final long recordNumber) { 203 this.recordNumber = recordNumber; 204 return asThis(); 205 } 206 207 /** 208 * Sets whether to enable byte tracking for the parser. 209 * 210 * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it. 211 * @return this instance. 212 * @since 1.13.0 213 */ 214 public Builder setTrackBytes(final boolean trackBytes) { 215 this.trackBytes = trackBytes; 216 return asThis(); 217 } 218 219 } 220 221 final class CSVRecordIterator implements Iterator<CSVRecord> { 222 private CSVRecord current; 223 224 /** 225 * Gets the next record or null at the end of stream or max rows read. 226 * 227 * @throws IOException on parse error or input read-failure 228 * @throws CSVException on invalid input. 229 * @return the next record, or {@code null} if the end of the stream has been reached. 230 */ 231 private CSVRecord getNextRecord() { 232 CSVRecord record = null; 233 if (format.useRow(recordNumber + 1)) { 234 record = Uncheck.get(CSVParser.this::nextRecord); 235 } 236 return record; 237 } 238 239 @Override 240 public boolean hasNext() { 241 if (isClosed()) { 242 return false; 243 } 244 if (current == null) { 245 current = getNextRecord(); 246 } 247 return current != null; 248 } 249 250 @Override 251 public CSVRecord next() { 252 if (isClosed()) { 253 throw new NoSuchElementException("CSVParser has been closed"); 254 } 255 CSVRecord next = current; 256 current = null; 257 if (next == null) { 258 // hasNext() wasn't called before 259 next = getNextRecord(); 260 if (next == null) { 261 throw new NoSuchElementException("No more CSV records available"); 262 } 263 } 264 return next; 265 } 266 267 @Override 268 public void remove() { 269 throw new UnsupportedOperationException(); 270 } 271 } 272 /** 273 * Header information based on name and position. 274 */ 275 private static final class Headers { 276 277 /** 278 * Header column positions (0-based) 279 */ 280 final Map<String, Integer> headerMap; 281 282 /** 283 * Header names in column order 284 */ 285 final List<String> headerNames; 286 287 Headers(final Map<String, Integer> headerMap, final List<String> headerNames) { 288 this.headerMap = headerMap; 289 this.headerNames = headerNames; 290 } 291 } 292 293 /** 294 * Creates a new builder. 295 * 296 * @return a new builder. 297 * @since 1.13.0 298 */ 299 public static Builder builder() { 300 return new Builder(); 301 } 302 303 /** 304 * Creates a parser for the given {@link File}. 305 * 306 * @param file 307 * a CSV file. Must not be null. 308 * @param charset 309 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}. 310 * @param format 311 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 312 * @return a new parser 313 * @throws IllegalArgumentException 314 * If the parameters of the format are inconsistent. 315 * @throws IOException 316 * If an I/O error occurs 317 * @throws CSVException Thrown on invalid CSV input data. 318 * @throws NullPointerException if {@code file} is {@code null}. 319 */ 320 public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { 321 Objects.requireNonNull(file, "file"); 322 return parse(file.toPath(), charset, format); 323 } 324 325 /** 326 * Creates a CSV parser using the given {@link CSVFormat}. 327 * 328 * <p> 329 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 330 * unless you close the {@code reader}. 331 * </p> 332 * 333 * @param inputStream 334 * an InputStream containing CSV-formatted input, {@code null} maps to {@link CSVFormat#DEFAULT}. 335 * @param charset 336 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}. 337 * @param format 338 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 339 * @return a new CSVParser configured with the given reader and format. 340 * @throws IllegalArgumentException 341 * If the parameters of the format are inconsistent or if either reader or format are null. 342 * @throws IOException 343 * If there is a problem reading the header or skipping the first record 344 * @throws CSVException Thrown on invalid CSV input data. 345 * @since 1.5 346 */ 347 public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) 348 throws IOException { 349 return parse(new InputStreamReader(inputStream, Charsets.toCharset(charset)), format); 350 } 351 352 /** 353 * Creates and returns a parser for the given {@link Path}, which the caller MUST close. 354 * 355 * @param path 356 * a CSV file. Must not be null. 357 * @param charset 358 * The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}. 359 * @param format 360 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 361 * @return a new parser 362 * @throws IllegalArgumentException 363 * If the parameters of the format are inconsistent. 364 * @throws IOException 365 * If an I/O error occurs 366 * @throws CSVException Thrown on invalid CSV input data. 367 * @throws NullPointerException if {@code path} is {@code null}. 368 * @since 1.5 369 */ 370 @SuppressWarnings("resource") 371 public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { 372 Objects.requireNonNull(path, "path"); 373 return parse(Files.newInputStream(path), charset, format); 374 } 375 376 /** 377 * Creates a CSV parser using the given {@link CSVFormat} 378 * 379 * <p> 380 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 381 * unless you close the {@code reader}. 382 * </p> 383 * 384 * @param reader 385 * a Reader containing CSV-formatted input. Must not be null. 386 * @param format 387 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 388 * @return a new CSVParser configured with the given reader and format. 389 * @throws IllegalArgumentException 390 * If the parameters of the format are inconsistent or if either reader or format are null. 391 * @throws IOException 392 * If there is a problem reading the header or skipping the first record 393 * @throws CSVException Thrown on invalid CSV input data. 394 * @since 1.5 395 */ 396 public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { 397 return builder().setReader(reader).setFormat(format).get(); 398 } 399 400 /** 401 * Creates a parser for the given {@link String}. 402 * 403 * @param string 404 * a CSV string. Must not be null. 405 * @param format 406 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 407 * @return a new parser 408 * @throws IllegalArgumentException 409 * If the parameters of the format are inconsistent. 410 * @throws IOException 411 * If an I/O error occurs 412 * @throws CSVException Thrown on invalid CSV input data. 413 * @throws NullPointerException if {@code string} is {@code null}. 414 */ 415 public static CSVParser parse(final String string, final CSVFormat format) throws IOException { 416 Objects.requireNonNull(string, "string"); 417 return parse(new StringReader(string), format); 418 } 419 420 /** 421 * Creates and returns a parser for the given URL, which the caller MUST close. 422 * 423 * <p> 424 * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless 425 * you close the {@code url}. 426 * </p> 427 * 428 * @param url 429 * a URL. Must not be null. 430 * @param charset 431 * the charset for the resource, {@code null} maps to the {@link Charset#defaultCharset() default Charset}. 432 * @param format 433 * the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}. 434 * @return a new parser 435 * @throws IllegalArgumentException 436 * If the parameters of the format are inconsistent. 437 * @throws IOException 438 * If an I/O error occurs 439 * @throws CSVException Thrown on invalid CSV input data. 440 * @throws NullPointerException if {@code url} is {@code null}. 441 */ 442 @SuppressWarnings("resource") 443 public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { 444 Objects.requireNonNull(url, "url"); 445 return parse(url.openStream(), charset, format); 446 } 447 448 private String headerComment; 449 450 private String trailerComment; 451 452 private final CSVFormat format; 453 454 private final Headers headers; 455 456 private final Lexer lexer; 457 458 private final CSVRecordIterator csvRecordIterator; 459 460 /** A record buffer for getRecord(). Grows as necessary and is reused. */ 461 private final List<String> recordList = new ArrayList<>(); 462 463 /** 464 * The next record number to assign. 465 */ 466 private long recordNumber; 467 468 /** 469 * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination 470 * with {@link #recordNumber}. 471 */ 472 private final long characterOffset; 473 474 private final Token reusableToken = new Token(); 475 476 /** 477 * Constructs a new instance using the given {@link CSVFormat}. 478 * 479 * <p> 480 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 481 * unless you close the {@code reader}. 482 * </p> 483 * 484 * @param reader 485 * a Reader containing CSV-formatted input. Must not be null. 486 * @param format 487 * the CSVFormat used for CSV parsing. Must not be null. 488 * @throws IllegalArgumentException 489 * If the parameters of the format are inconsistent or if either reader or format are null. 490 * @throws IOException 491 * If there is a problem reading the header or skipping the first record 492 * @throws CSVException Thrown on invalid CSV input data. 493 * @deprecated Will be removed in the next major version, use {@link Builder#get()}. 494 */ 495 @Deprecated 496 public CSVParser(final Reader reader, final CSVFormat format) throws IOException { 497 this(reader, format, 0, 1); 498 } 499 500 /** 501 * Constructs a new instance using the given {@link CSVFormat}. 502 * 503 * <p> 504 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 505 * unless you close the {@code reader}. 506 * </p> 507 * 508 * @param reader 509 * a Reader containing CSV-formatted input. Must not be null. 510 * @param format 511 * the CSVFormat used for CSV parsing. Must not be null. 512 * @param characterOffset 513 * Lexer offset when the parser does not start parsing at the beginning of the source. 514 * @param recordNumber 515 * The next record number to assign. 516 * @throws IllegalArgumentException 517 * If the parameters of the format are inconsistent or if either the reader or format is null. 518 * @throws IOException 519 * if there is a problem reading the header or skipping the first record 520 * @throws CSVException on invalid input. 521 * @since 1.1 522 * @deprecated Will be removed in the next major version, use {@link Builder#get()}. 523 */ 524 @Deprecated 525 public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { 526 this(reader, format, characterOffset, recordNumber, null, false); 527 } 528 529 /** 530 * Constructs a new instance using the given {@link CSVFormat}. 531 * 532 * <p> 533 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 534 * unless you close the {@code reader}. 535 * </p> 536 * 537 * @param reader 538 * a Reader containing CSV-formatted input. Must not be null. 539 * @param format 540 * the CSVFormat used for CSV parsing. Must not be null. 541 * @param characterOffset 542 * Lexer offset when the parser does not start parsing at the beginning of the source. 543 * @param recordNumber 544 * The next record number to assign. 545 * @param charset 546 * The character encoding to be used for the reader when enableByteTracking is true. 547 * @param trackBytes 548 * {@code true} to enable byte tracking for the parser; {@code false} to disable it. 549 * @throws IllegalArgumentException 550 * If the parameters of the format are inconsistent or if either the reader or format is null. 551 * @throws IOException 552 * If there is a problem reading the header or skipping the first record. 553 * @throws CSVException Thrown on invalid CSV input data. 554 */ 555 private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, 556 final Charset charset, final boolean trackBytes) 557 throws IOException { 558 Objects.requireNonNull(reader, "reader"); 559 Objects.requireNonNull(format, "format"); 560 this.format = format.copy(); 561 this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes)); 562 this.csvRecordIterator = new CSVRecordIterator(); 563 this.headers = createHeaders(); 564 this.characterOffset = characterOffset; 565 this.recordNumber = recordNumber - 1; 566 } 567 568 private void addRecordValue(final boolean lastRecord) { 569 final String input = format.trim(reusableToken.content.toString()); 570 if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) { 571 return; 572 } 573 recordList.add(handleNull(input)); 574 } 575 576 /** 577 * Closes resources. 578 * 579 * @throws IOException 580 * If an I/O error occurs 581 */ 582 @Override 583 public void close() throws IOException { 584 lexer.close(); 585 } 586 587 private Map<String, Integer> createEmptyHeaderMap() { 588 return format.getIgnoreHeaderCase() ? 589 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) : 590 new LinkedHashMap<>(); 591 } 592 593 /** 594 * Creates the name to index mapping if the format defines a header. 595 * 596 * @return null if the format has no header. 597 * @throws IOException if there is a problem reading the header or skipping the first record 598 * @throws CSVException on invalid input. 599 */ 600 private Headers createHeaders() throws IOException { 601 Map<String, Integer> headerMap = null; 602 List<String> headerNames = null; 603 final String[] formatHeader = format.getHeader(); 604 if (formatHeader != null) { 605 headerMap = createEmptyHeaderMap(); 606 String[] headerRecord = null; 607 if (formatHeader.length == 0) { 608 // read the header from the first line of the file 609 final CSVRecord nextRecord = nextRecord(); 610 if (nextRecord != null) { 611 headerRecord = nextRecord.values(); 612 headerComment = nextRecord.getComment(); 613 } 614 } else { 615 if (format.getSkipHeaderRecord()) { 616 final CSVRecord nextRecord = nextRecord(); 617 if (nextRecord != null) { 618 headerComment = nextRecord.getComment(); 619 } 620 } 621 headerRecord = formatHeader; 622 } 623 // build the name to index mappings 624 if (headerRecord != null) { 625 // Track an occurrence of a null, empty or blank header. 626 boolean observedMissing = false; 627 for (int i = 0; i < headerRecord.length; i++) { 628 final String header = headerRecord[i]; 629 final boolean blankHeader = CSVFormat.isBlank(header); 630 if (blankHeader && !format.getAllowMissingColumnNames()) { 631 throw new IllegalArgumentException("A header name is missing in " + Arrays.toString(headerRecord)); 632 } 633 final boolean containsHeader = blankHeader ? observedMissing : headerMap.containsKey(header); 634 final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode(); 635 final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL; 636 final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY; 637 if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) { 638 throw new IllegalArgumentException(String.format( 639 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().", 640 header, Arrays.toString(headerRecord))); 641 } 642 observedMissing |= blankHeader; 643 if (header != null) { 644 headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional 645 if (headerNames == null) { 646 headerNames = new ArrayList<>(headerRecord.length); 647 } 648 headerNames.add(header); 649 } 650 } 651 } 652 } 653 // Make header names Collection immutable 654 return new Headers(headerMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames)); 655 } 656 657 /** 658 * Gets the current line number in the input stream. 659 * 660 * <p> 661 * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to 662 * the record number. 663 * </p> 664 * 665 * @return current line number. 666 */ 667 public long getCurrentLineNumber() { 668 return lexer.getCurrentLineNumber(); 669 } 670 671 /** 672 * Gets the first end-of-line string encountered. 673 * 674 * @return the first end-of-line string. 675 * @since 1.5 676 */ 677 public String getFirstEndOfLine() { 678 return lexer.getFirstEol(); 679 } 680 681 /** 682 * Gets the header comment, if any. 683 * The header comment appears before the header record. 684 * 685 * @return the header comment for this stream, or null if no comment is available. 686 * @since 1.10.0 687 */ 688 public String getHeaderComment() { 689 return headerComment; 690 } 691 692 /** 693 * Gets a copy of the header map as defined in the CSVFormat's header. 694 * <p> 695 * The map keys are column names. The map values are 0-based indices. 696 * </p> 697 * <p> 698 * <strong>Note:</strong> The map can only provide a one-to-one mapping when the format did not 699 * contain null or duplicate column names. 700 * </p> 701 * 702 * @return a copy of the header map. 703 */ 704 public Map<String, Integer> getHeaderMap() { 705 if (headers.headerMap == null) { 706 return null; 707 } 708 final Map<String, Integer> map = createEmptyHeaderMap(); 709 map.putAll(headers.headerMap); 710 return map; 711 } 712 713 /** 714 * Gets the underlying header map. 715 * 716 * @return the underlying header map. 717 */ 718 Map<String, Integer> getHeaderMapRaw() { 719 return headers.headerMap; 720 } 721 722 /** 723 * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header. 724 * <p> 725 * Note: The list provides strings that can be used as keys in the header map. 726 * The list will not contain null column names if they were present in the input 727 * format. 728 * </p> 729 * 730 * @return read-only list of header names that iterates in column order. 731 * @see #getHeaderMap() 732 * @since 1.7 733 */ 734 public List<String> getHeaderNames() { 735 return Collections.unmodifiableList(headers.headerNames); 736 } 737 738 /** 739 * Gets the current record number in the input stream. 740 * 741 * <p> 742 * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to 743 * the line number. 744 * </p> 745 * 746 * @return current record number 747 */ 748 public long getRecordNumber() { 749 return recordNumber; 750 } 751 752 /** 753 * Parses the CSV input according to the given format and returns the content as a list of 754 * {@link CSVRecord CSVRecords}. 755 * 756 * <p> 757 * The returned content starts at the current parse-position in the stream. 758 * </p> 759 * <p> 760 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows this method produces. 761 * </p> 762 * 763 * @return list of {@link CSVRecord CSVRecords}, may be empty 764 * @throws UncheckedIOException 765 * on parse error or input read-failure 766 */ 767 public List<CSVRecord> getRecords() { 768 return stream().collect(Collectors.toList()); 769 } 770 771 /** 772 * Gets the trailer comment, if any. 773 * Trailer comments are located between the last record and EOF 774 * 775 * @return the trailer comment for this stream, or null if no comment is available. 776 * @since 1.10.0 777 */ 778 public String getTrailerComment() { 779 return trailerComment; 780 } 781 782 /** 783 * Handles whether the input is parsed as null 784 * 785 * @param input 786 * the cell data to further processed 787 * @return null if input is parsed as null, or input itself if the input isn't parsed as null 788 */ 789 private String handleNull(final String input) { 790 final boolean isQuoted = reusableToken.isQuoted; 791 final String nullString = format.getNullString(); 792 final boolean strictQuoteMode = isStrictQuoteMode(); 793 if (input.equals(nullString)) { 794 // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode 795 return strictQuoteMode && isQuoted ? input : null; 796 } 797 // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode 798 return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input; 799 } 800 801 /** 802 * Checks whether there is a header comment. 803 * The header comment appears before the header record. 804 * Note that if the parser's format has been given an explicit header 805 * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload) 806 * and the header record is not being skipped 807 * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments 808 * will be associated with the first record, not the header. 809 * 810 * @return true if this parser has seen a header comment, false otherwise 811 * @since 1.10.0 812 */ 813 public boolean hasHeaderComment() { 814 return headerComment != null; 815 } 816 817 /** 818 * Checks whether there is a trailer comment. 819 * Trailer comments are located between the last record and EOF. 820 * The trailer comments will only be available after the parser has 821 * finished processing this stream. 822 * 823 * @return true if this parser has seen a trailer comment, false otherwise 824 * @since 1.10.0 825 */ 826 public boolean hasTrailerComment() { 827 return trailerComment != null; 828 } 829 830 /** 831 * Tests whether this parser is closed. 832 * 833 * @return whether this parser is closed. 834 */ 835 public boolean isClosed() { 836 return lexer.isClosed(); 837 } 838 839 /** 840 * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}. 841 * 842 * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or 843 * {@link QuoteMode#NON_NUMERIC}. 844 */ 845 private boolean isStrictQuoteMode() { 846 return format.getQuoteMode() == QuoteMode.ALL_NON_NULL || 847 format.getQuoteMode() == QuoteMode.NON_NUMERIC; 848 } 849 850 /** 851 * Returns the record iterator. 852 * 853 * <p> 854 * An {@link IOException} caught during the iteration is re-thrown as an 855 * {@link IllegalStateException}. 856 * </p> 857 * <p> 858 * If the parser is closed, the iterator will not yield any more records. 859 * A call to {@link Iterator#hasNext()} will return {@code false} and 860 * a call to {@link Iterator#next()} will throw a 861 * {@link NoSuchElementException}. 862 * </p> 863 * <p> 864 * If it is necessary to construct an iterator which is usable after the 865 * parser is closed, one option is to extract all records as a list with 866 * {@link #getRecords()}, and return an iterator to that list. 867 * </p> 868 * <p> 869 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows an Iterator produces. 870 * </p> 871 */ 872 @Override 873 public Iterator<CSVRecord> iterator() { 874 return csvRecordIterator; 875 } 876 877 /** 878 * Parses the next record from the current point in the stream. 879 * 880 * @return the record as an array of values, or {@code null} if the end of the stream has been reached. 881 * @throws IOException on parse error or input read-failure. 882 * @throws CSVException on invalid CSV input data. 883 */ 884 CSVRecord nextRecord() throws IOException { 885 CSVRecord result = null; 886 recordList.clear(); 887 StringBuilder sb = null; 888 final long startCharPosition = lexer.getCharacterPosition() + characterOffset; 889 final long startBytePosition = lexer.getBytesRead() + this.characterOffset; 890 do { 891 reusableToken.reset(); 892 lexer.nextToken(reusableToken); 893 switch (reusableToken.type) { 894 case TOKEN: 895 addRecordValue(false); 896 break; 897 case EORECORD: 898 addRecordValue(true); 899 break; 900 case EOF: 901 if (reusableToken.isReady) { 902 addRecordValue(true); 903 } else if (sb != null) { 904 trailerComment = sb.toString(); 905 } 906 break; 907 case INVALID: 908 throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber()); 909 case COMMENT: // Ignored currently 910 if (sb == null) { // first comment for this record 911 sb = new StringBuilder(); 912 } else { 913 sb.append(Constants.LF); 914 } 915 sb.append(reusableToken.content); 916 reusableToken.type = TOKEN; // Read another token 917 break; 918 default: 919 throw new CSVException("Unexpected Token type: %s", reusableToken.type); 920 } 921 } while (reusableToken.type == TOKEN); 922 923 if (!recordList.isEmpty()) { 924 recordNumber++; 925 final String comment = Objects.toString(sb, null); 926 result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, 927 recordNumber, startCharPosition, startBytePosition); 928 } 929 return result; 930 } 931 932 /** 933 * Returns a sequential {@code Stream} with this collection as its source. 934 * <p> 935 * If the parser is closed, the stream will not produce any more values. 936 * See the comments in {@link #iterator()}. 937 * </p> 938 * <p> 939 * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows a Stream produces. 940 * </p> 941 * 942 * @return a sequential {@code Stream} with this collection as its source. 943 * @since 1.9.0 944 */ 945 public Stream<CSVRecord> stream() { 946 return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false); 947 } 948 949}