/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.io.function.Uncheck;

/**
 * Parses CSV files according to the specified format.
 *
 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
 * specification of a {@link CSVFormat}.
 *
 * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
 *
 * <h2>Creating instances</h2>
 * <p>
 * There are several static factory methods that can be used to create instances for various types of resources:
 * </p>
 * <ul>
 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
 *     <li>{@link #parse(String, CSVFormat)}</li>
 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
 * </ul>
 * <p>
 * Alternatively parsers can also be created from a {@link Reader} with {@link #parse(Reader, CSVFormat)} or with a
 * {@link #builder() builder}.
 *
 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
 * </p>
 * <pre>
 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
 *     ...
 * }
 * </pre>
 *
 * <h2>Parsing record wise</h2>
 * <p>
 * To parse a CSV input from a file, you write:
 * </p>
 *
 * <pre>{@code
 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }}
 * </pre>
 *
 * <p>
 * This will read and parse the contents of the file using the
 * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
 * </p>
 *
 * <p>
 * To parse CSV input in a format like Excel, you write:
 * </p>
 *
 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
 * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
 * </p>
 *
 * <h2>Parsing into memory</h2>
 * <p>
 * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
 * </p>
 *
 * <pre>{@code
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = CSVParser.parse(in, CSVFormat.EXCEL);
 * List<CSVRecord> list = parser.getRecords();
 * }</pre>
 *
 * <p>
 * There are two constraints that have to be kept in mind:
 * </p>
 *
 * <ol>
 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
 *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
 * </ol>
 *
 * <h2>Notes</h2>
 * <p>
 * The internal parser state is completely covered by the format and the reader state.
 * </p>
 *
 * @see <a href="package-summary.html">package documentation for more details</a>
 */
public final class CSVParser implements Iterable<CSVRecord>, Closeable {

    /**
     * Builds a new {@link CSVParser}.
     *
     * @since 1.13.0
     */
    public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {

        private CSVFormat format;
        private long characterOffset;
        private long recordNumber = 1;
        private boolean trackBytes;

        /**
         * Constructs a new instance.
         */
        protected Builder() {
            // empty
        }

        /**
         * Builds the parser from this builder's reader, format (or {@link CSVFormat#DEFAULT} when none was set),
         * character offset, record number, charset and byte-tracking flag.
         *
         * @return a new parser.
         * @throws IOException if the parser cannot be constructed.
         */
        @SuppressWarnings("resource")
        @Override
        public CSVParser get() throws IOException {
            return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
        }

        /**
         * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
         *
         * @param characterOffset the lexer offset.
         * @return this instance.
         */
        public Builder setCharacterOffset(final long characterOffset) {
            this.characterOffset = characterOffset;
            return asThis();
        }

        /**
         * Sets the CSV format. A copy of the given format is kept.
         *
         * @param format the CSV format, null is equivalent to {@link CSVFormat#DEFAULT}.
         * @return this instance.
         */
        public Builder setFormat(final CSVFormat format) {
            this.format = CSVFormat.copy(format);
            return asThis();
        }

        /**
         * Sets the next record number to assign, defaults to {@code 1}.
         *
         * @param recordNumber the next record number to assign.
         * @return this instance.
         */
        public Builder setRecordNumber(final long recordNumber) {
            this.recordNumber = recordNumber;
            return asThis();
        }

        /**
         * Sets whether to enable byte tracking for the parser.
         *
         * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
         * @return this instance.
         * @since 1.13.0
         */
        public Builder setTrackBytes(final boolean trackBytes) {
            this.trackBytes = trackBytes;
            return asThis();
        }

    }

    /**
     * Iterates over the records of the enclosing parser, buffering at most one look-ahead record.
     */
    final class CSVRecordIterator implements Iterator<CSVRecord> {

        /** Record fetched by {@link #hasNext()} and not yet handed out by {@link #next()}; null if none. */
        private CSVRecord current;

        /**
         * Gets the next record.
         *
         * @throws UncheckedIOException on parse error or input read-failure; wraps the {@link IOException}
         *         thrown by {@link CSVParser#nextRecord()}.
         * @return the next record, or {@code null} if the end of the stream has been reached.
         */
        private CSVRecord getNextRecord() {
            return Uncheck.get(CSVParser.this::nextRecord);
        }

        @Override
        public boolean hasNext() {
            if (isClosed()) {
                return false;
            }
            if (current == null) {
                current = getNextRecord();
            }

            return current != null;
        }

        @Override
        public CSVRecord next() {
            if (isClosed()) {
                throw new NoSuchElementException("CSVParser has been closed");
            }
            CSVRecord next = current;
            current = null;

            if (next == null) {
                // hasNext() wasn't called before
                next = getNextRecord();
                if (next == null) {
                    throw new NoSuchElementException("No more CSV records available");
                }
            }

            return next;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Header information based on name and position.
     */
    private static final class Headers {

        /**
         * Header column positions (0-based)
         */
        final Map<String, Integer> headerMap;

        /**
         * Header names in column order
         */
        final List<String> headerNames;

        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
            this.headerMap = headerMap;
            this.headerNames = headerNames;
        }
    }

    /**
     * Creates a new builder.
     *
     * @return a new builder.
     * @since 1.13.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Creates a parser for the given {@link File}.
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param charset
     *            The Charset to decode the given file.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @throws CSVException Thrown on invalid input.
     */
    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(file, "file");
        return parse(file.toPath(), charset, format);
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}.
     *
     * <p>
     * If you do not read all records from the given {@code inputStream}, you should call {@link #close()} on the
     * parser, unless you close the {@code inputStream}.
     * </p>
     *
     * @param inputStream
     *            an InputStream containing CSV-formatted input. Must not be null.
     * @param charset
     *            The Charset to decode the given input stream.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given input stream and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either inputStream or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @throws CSVException Thrown on invalid input.
     * @since 1.5
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
            throws IOException {
        Objects.requireNonNull(inputStream, "inputStream");
        Objects.requireNonNull(format, "format");
        return parse(new InputStreamReader(inputStream, charset), format);
    }

    /**
     * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
     *
     * <p>
     * If the parser cannot be created, the stream opened on {@code path} is closed before the failure is
     * propagated.
     * </p>
     *
     * @param path
     *            a CSV file. Must not be null.
     * @param charset
     *            The Charset to decode the given file.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either path or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @throws CSVException Thrown on invalid input.
     * @since 1.5
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(format, "format");
        return parseClosingOnFailure(Files.newInputStream(path), charset, format);
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given reader and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @throws CSVException Thrown on invalid input.
     * @since 1.5
     */
    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
        return builder().setReader(reader).setFormat(format).get();
    }

    /**
     * Creates a parser for the given {@link String}.
     *
     * @param string
     *            a CSV string. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @throws CSVException Thrown on invalid input.
     */
    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
        Objects.requireNonNull(string, "string");
        Objects.requireNonNull(format, "format");
        return parse(new StringReader(string), format);
    }

    /**
     * Creates and returns a parser for the given URL, which the caller MUST close.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser.
     * If the parser cannot be created, the stream opened on {@code url} is closed before the failure is
     * propagated.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @throws CSVException Thrown on invalid input.
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(url, "url");
        return parseClosingOnFailure(url.openStream(), charset, format);
    }

    /**
     * Creates a parser on a stream this class opened itself, closing that stream if the parser cannot be created.
     *
     * <p>
     * Without this, a failure while constructing the parser (for example a null charset or format, or an error
     * while reading the header) would leave the freshly opened stream unreachable and unclosed.
     * </p>
     *
     * @param inputStream the stream to parse; closed here only when parser creation fails.
     * @param charset the charset to decode the stream.
     * @param format the CSVFormat used for CSV parsing.
     * @return a new parser that owns {@code inputStream}.
     * @throws IOException if the parser cannot be created.
     */
    private static CSVParser parseClosingOnFailure(final InputStream inputStream, final Charset charset, final CSVFormat format)
            throws IOException {
        try {
            return parse(inputStream, charset, format);
        } catch (final IOException | RuntimeException e) {
            try {
                inputStream.close();
            } catch (final IOException closeException) {
                // Keep the original failure as the primary exception.
                e.addSuppressed(closeException);
            }
            throw e;
        }
    }

    private String headerComment;

    private String trailerComment;

    private final CSVFormat format;

    private final Headers headers;

    private final Lexer lexer;

    private final CSVRecordIterator csvRecordIterator;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> recordList = new ArrayList<>();

    /**
     * The number assigned to the most recently parsed record. It is incremented before being assigned to the next
     * record, so it is initialized to one less than the first record number to assign.
     */
    private long recordNumber;

    /**
     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
     * with {@link #recordNumber}.
     */
    private final long characterOffset;

    private final Token reusableToken = new Token();

    /**
     * Constructs a new instance using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @throws CSVException Thrown on invalid input.
     * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
     */
    @Deprecated
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        this(reader, format, 0, 1);
    }

    /**
     * Constructs a new instance using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @param characterOffset
     *            Lexer offset when the parser does not start parsing at the beginning of the source.
     * @param recordNumber
     *            The next record number to assign.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either the reader or format is null.
     * @throws IOException
     *             if there is a problem reading the header or skipping the first record
     * @throws CSVException on invalid input.
     * @since 1.1
     * @deprecated Will be private in the next major version, use {@link Builder#get()}.
     */
    @Deprecated
    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
        throws IOException {
        this(reader, format, characterOffset, recordNumber, null, false);
    }

    /**
     * Constructs a new instance using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @param characterOffset
     *            Lexer offset when the parser does not start parsing at the beginning of the source.
     * @param recordNumber
     *            The next record number to assign.
     * @param charset
     *            The character encoding to be used for the reader when {@code trackBytes} is true.
     * @param trackBytes
     *            {@code true} to enable byte tracking for the parser; {@code false} to disable it.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws NullPointerException
     *             If either the reader or format is null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record.
     * @throws CSVException Thrown on invalid input.
     */
    private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
        final Charset charset, final boolean trackBytes)
        throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(format, "format");
        this.format = format.copy();
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
        this.csvRecordIterator = new CSVRecordIterator();
        // Reading the header may consume the first record and advance this.recordNumber; the assignments below
        // then reset the counters to the caller-supplied starting point, so their order matters.
        this.headers = createHeaders();
        this.characterOffset = characterOffset;
        // nextRecord() increments before assigning, so store one less than the next number to assign.
        this.recordNumber = recordNumber - 1;
    }

    /**
     * Adds the content of the reusable token to the current record buffer.
     *
     * @param lastRecord whether the token is the last value of the record; an empty last value is dropped when the
     *        format uses a trailing delimiter.
     */
    private void addRecordValue(final boolean lastRecord) {
        final String input = format.trim(reusableToken.content.toString());
        if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
            return;
        }
        recordList.add(handleNull(input));
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        lexer.close();
    }

    /**
     * Creates an empty, mutable header map: case-insensitive when the format ignores header case, otherwise
     * insertion-ordered.
     *
     * @return a new empty header map.
     */
    private Map<String, Integer> createEmptyHeaderMap() {
        return format.getIgnoreHeaderCase() ?
                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
                new LinkedHashMap<>();
    }

    /**
     * Creates the name to index mapping if the format defines a header.
     *
     * @return the header information, never null; its header map is null and its header names are empty if the
     *         format has no header.
     * @throws IllegalArgumentException if a header name is missing or duplicated and the format does not allow it.
     * @throws IOException if there is a problem reading the header or skipping the first record
     * @throws CSVException on invalid input.
     */
    private Headers createHeaders() throws IOException {
        Map<String, Integer> hdrMap = null;
        List<String> headerNames = null;
        final String[] formatHeader = format.getHeader();
        if (formatHeader != null) {
            hdrMap = createEmptyHeaderMap();
            String[] headerRecord = null;
            if (formatHeader.length == 0) {
                // read the header from the first line of the file
                final CSVRecord nextRecord = nextRecord();
                if (nextRecord != null) {
                    headerRecord = nextRecord.values();
                    headerComment = nextRecord.getComment();
                }
            } else {
                if (format.getSkipHeaderRecord()) {
                    final CSVRecord nextRecord = nextRecord();
                    if (nextRecord != null) {
                        headerComment = nextRecord.getComment();
                    }
                }
                headerRecord = formatHeader;
            }

            // build the name to index mappings
            if (headerRecord != null) {
                // The duplicate policy does not depend on the column, so resolve it once.
                final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
                final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
                final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
                // Track an occurrence of a null, empty or blank header.
                boolean observedMissing = false;
                for (int i = 0; i < headerRecord.length; i++) {
                    final String header = headerRecord[i];
                    final boolean blankHeader = CSVFormat.isBlank(header);
                    if (blankHeader && !format.getAllowMissingColumnNames()) {
                        throw new IllegalArgumentException(
                            "A header name is missing in " + Arrays.toString(headerRecord));
                    }

                    final boolean containsHeader = blankHeader ? observedMissing : hdrMap.containsKey(header);

                    if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
                        throw new IllegalArgumentException(
                            String.format(
                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
                                header, Arrays.toString(headerRecord)));
                    }
                    observedMissing |= blankHeader;
                    if (header != null) {
                        hdrMap.put(header, Integer.valueOf(i)); // N.B. Explicit (un)boxing is intentional
                        if (headerNames == null) {
                            headerNames = new ArrayList<>(headerRecord.length);
                        }
                        headerNames.add(header);
                    }
                }
            }
        }
        // Make header names Collection immutable
        return new Headers(hdrMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
    }

    /**
     * Gets the current line number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the record number.
     * </p>
     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        return lexer.getCurrentLineNumber();
    }

    /**
     * Gets the first end-of-line string encountered.
     *
     * @return the first end-of-line string
     * @since 1.5
     */
    public String getFirstEndOfLine() {
        return lexer.getFirstEol();
    }

    /**
     * Gets the header comment, if any.
     * The header comment appears before the header record.
     *
     * @return the header comment for this stream, or null if no comment is available.
     * @since 1.10.0
     */
    public String getHeaderComment() {
        return headerComment;
    }

    /**
     * Gets a copy of the header map as defined in the CSVFormat's header.
     * <p>
     * The map keys are column names. The map values are 0-based indices.
     * </p>
     * <p>
     * Note: The map can only provide a one-to-one mapping when the format did not
     * contain null or duplicate column names.
     * </p>
     *
     * @return a copy of the header map, or {@code null} if the format has no header.
     */
    public Map<String, Integer> getHeaderMap() {
        if (headers.headerMap == null) {
            return null;
        }
        final Map<String, Integer> map = createEmptyHeaderMap();
        map.putAll(headers.headerMap);
        return map;
    }

    /**
     * Gets the underlying header map.
     *
     * @return the underlying header map.
     */
    Map<String, Integer> getHeaderMapRaw() {
        return headers.headerMap;
    }

    /**
     * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
     * <p>
     * Note: The list provides strings that can be used as keys in the header map.
     * The list will not contain null column names if they were present in the input
     * format.
     * </p>
     *
     * @return read-only list of header names that iterates in column order.
     * @see #getHeaderMap()
     * @since 1.7
     */
    public List<String> getHeaderNames() {
        // createHeaders() already stores an unmodifiable list, so no further wrapping is needed.
        return headers.headerNames;
    }

    /**
     * Gets the current record number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        return recordNumber;
    }

    /**
     * Parses the CSV input according to the given format and returns the content as a list of
     * {@link CSVRecord CSVRecords}.
     *
     * <p>
     * The returned content starts at the current parse-position in the stream.
     * </p>
     *
     * @return list of {@link CSVRecord CSVRecords}, may be empty
     * @throws UncheckedIOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() {
        return stream().collect(Collectors.toList());
    }

    /**
     * Gets the trailer comment, if any.
     * Trailer comments are located between the last record and EOF
     *
     * @return the trailer comment for this stream, or null if no comment is available.
     * @since 1.10.0
     */
    public String getTrailerComment() {
        return trailerComment;
    }

    /**
     * Handles whether the input is parsed as null
     *
     * @param input
     *           the cell data to further processed
     * @return null if input is parsed as null, or input itself if the input isn't parsed as null
     */
    private String handleNull(final String input) {
        final boolean isQuoted = reusableToken.isQuoted;
        final String nullString = format.getNullString();
        final boolean strictQuoteMode = isStrictQuoteMode();
        if (input.equals(nullString)) {
            // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
            return strictQuoteMode && isQuoted ? input : null;
        }
        // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
        return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
    }

    /**
     * Checks whether there is a header comment.
     * The header comment appears before the header record.
     * Note that if the parser's format has been given an explicit header
     * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
     * and the header record is not being skipped
     * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
     * will be associated with the first record, not the header.
     *
     * @return true if this parser has seen a header comment, false otherwise
     * @since 1.10.0
     */
    public boolean hasHeaderComment() {
        return headerComment != null;
    }

    /**
     * Checks whether there is a trailer comment.
     * Trailer comments are located between the last record and EOF.
     * The trailer comments will only be available after the parser has
     * finished processing this stream.
     *
     * @return true if this parser has seen a trailer comment, false otherwise
     * @since 1.10.0
     */
    public boolean hasTrailerComment() {
        return trailerComment != null;
    }

    /**
     * Tests whether this parser is closed.
     *
     * @return whether this parser is closed.
     */
    public boolean isClosed() {
        return lexer.isClosed();
    }

    /**
     * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
     *
     * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
     *         {@link QuoteMode#NON_NUMERIC}.
     */
    private boolean isStrictQuoteMode() {
        return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
               format.getQuoteMode() == QuoteMode.NON_NUMERIC;
    }

    /**
     * Returns the record iterator.
     *
     * <p>
     * The same iterator instance is returned on every call, so all callers share one position in the input.
     * </p>
     * <p>
     * An {@link IOException} caught during the iteration is re-thrown as an
     * {@link UncheckedIOException}.
     * </p>
     * <p>
     * If the parser is closed, the iterator will not yield any more records.
     * A call to {@link Iterator#hasNext()} will return {@code false} and
     * a call to {@link Iterator#next()} will throw a
     * {@link NoSuchElementException}.
     * </p>
     * <p>
     * If it is necessary to construct an iterator which is usable after the
     * parser is closed, one option is to extract all records as a list with
     * {@link #getRecords()}, and return an iterator to that list.
     * </p>
     */
    @Override
    public Iterator<CSVRecord> iterator() {
        return csvRecordIterator;
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * <p>
     * Comment tokens read before the record's values are joined with line feeds and attached to the record as its
     * comment. Comments followed directly by the end of the input become the trailer comment instead.
     * </p>
     *
     * @return the record, or {@code null} if the end of the stream has been reached
     * @throws IOException on parse error or input read-failure
     * @throws CSVException on invalid input.
     */
    CSVRecord nextRecord() throws IOException {
        CSVRecord result = null;
        recordList.clear();
        StringBuilder sb = null;
        final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
        // NOTE(review): the byte position is shifted by the *character* offset; the two differ for multi-byte
        // encodings when parsing resumes mid-source - confirm this is intended.
        final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
        do {
            reusableToken.reset();
            lexer.nextToken(reusableToken);
            switch (reusableToken.type) {
            case TOKEN:
                addRecordValue(false);
                break;
            case EORECORD:
                addRecordValue(true);
                break;
            case EOF:
                if (reusableToken.isReady) {
                    addRecordValue(true);
                } else if (sb != null) {
                    trailerComment = sb.toString();
                }
                break;
            case INVALID:
                throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
            case COMMENT: // Accumulated: attached to the next record, or kept as the trailer comment at EOF
                if (sb == null) { // first comment for this record
                    sb = new StringBuilder();
                } else {
                    sb.append(Constants.LF);
                }
                sb.append(reusableToken.content);
                reusableToken.type = TOKEN; // Read another token
                break;
            default:
                throw new CSVException("Unexpected Token type: %s", reusableToken.type);
            }
        } while (reusableToken.type == TOKEN);

        if (!recordList.isEmpty()) {
            recordNumber++;
            final String comment = Objects.toString(sb, null);
            result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
                recordNumber, startCharPosition, startBytePosition);
        }
        return result;
    }

    /**
     * Returns a sequential {@code Stream} with this collection as its source.
     * <p>
     * If the parser is closed, the stream will not produce any more values.
     * See the comments in {@link #iterator()}.
     * </p>
     * @return a sequential {@code Stream} with this collection as its source.
     * @since 1.9.0
     */
    public Stream<CSVRecord> stream() {
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
    }

}