CSVParser.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  *   https://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */

  19. package org.apache.commons.csv;

  20. import static org.apache.commons.csv.Token.Type.TOKEN;

  21. import java.io.Closeable;
  22. import java.io.File;
  23. import java.io.IOException;
  24. import java.io.InputStream;
  25. import java.io.InputStreamReader;
  26. import java.io.Reader;
  27. import java.io.StringReader;
  28. import java.io.UncheckedIOException;
  29. import java.net.URL;
  30. import java.nio.charset.Charset;
  31. import java.nio.file.Files;
  32. import java.nio.file.Path;
  33. import java.util.ArrayList;
  34. import java.util.Arrays;
  35. import java.util.Collections;
  36. import java.util.Iterator;
  37. import java.util.LinkedHashMap;
  38. import java.util.List;
  39. import java.util.Map;
  40. import java.util.NoSuchElementException;
  41. import java.util.Objects;
  42. import java.util.Spliterator;
  43. import java.util.Spliterators;
  44. import java.util.TreeMap;
  45. import java.util.stream.Collectors;
  46. import java.util.stream.Stream;
  47. import java.util.stream.StreamSupport;

  48. import org.apache.commons.io.Charsets;
  49. import org.apache.commons.io.build.AbstractStreamBuilder;
  50. import org.apache.commons.io.function.Uncheck;

  51. /**
  52.  * Parses CSV files according to the specified format.
  53.  *
  54.  * Because CSV appears in many different dialects, the parser supports many formats by allowing the
  55.  * specification of a {@link CSVFormat}.
  56.  *
  57.  * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
  58.  *
  59.  * <h2>Creating instances</h2>
  60.  * <p>
  61.  * There are several static factory methods that can be used to create instances for various types of resources:
  62.  * </p>
  63.  * <ul>
  64.  *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
  65.  *     <li>{@link #parse(String, CSVFormat)}</li>
  66.  *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
  67.  * </ul>
  68.  * <p>
  69.  * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
  70.  *
  71.  * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
  72.  * </p>
  73.  * <pre>
  74.  * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
  75.  *     ...
  76.  * }
  77.  * </pre>
  78.  *
  79.  * <h2>Parsing record wise</h2>
  80.  * <p>
  81.  * To parse a CSV input from a file, you write:
  82.  * </p>
  83.  *
  84.  * <pre>{@code
  85.  * File csvData = new File("/path/to/csv");
  86.  * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
  87.  * for (CSVRecord csvRecord : parser) {
  88.  *     ...
  89.  * }}
  90.  * </pre>
  91.  *
  92.  * <p>
  93.  * This will read and parse the contents of the file using the
  94.  * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
  95.  * </p>
  96.  *
  97.  * <p>
  98.  * To parse CSV input in a format like Excel, you write:
  99.  * </p>
  100.  *
  101.  * <pre>
  102.  * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
  103.  * for (CSVRecord csvRecord : parser) {
  104.  *     ...
  105.  * }
  106.  * </pre>
  107.  *
  108.  * <p>
  109.  * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
  110.  * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
  111.  * </p>
  112.  *
  113.  * <h2>Parsing into memory</h2>
  114.  * <p>
  115.  * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
  116.  * </p>
  117.  *
  118.  * <pre>{@code
  119.  * Reader in = new StringReader("a;b\nc;d");
  120.  * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
  121.  * List<CSVRecord> list = parser.getRecords();
  122.  * }</pre>
  123.  *
  124.  * <p>
  125.  * There are two constraints that have to be kept in mind:
  126.  * </p>
  127.  *
  128.  * <ol>
  129.  *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
  130.  *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
  131.  *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
  132.  *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
  133.  * </ol>
  134.  *
  135.  * <h2>Notes</h2>
  136.  * <p>
  137.  * The internal parser state is completely covered by the format and the reader state.
  138.  * </p>
  139.  *
  140.  * @see <a href="package-summary.html">package documentation for more details</a>
  141.  */
  142. public final class CSVParser implements Iterable<CSVRecord>, Closeable {

  143.     /**
  144.      * Builds a new {@link CSVParser}.
  145.      *
  146.      * @since 1.13.0
  147.      */
  148.     public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {

  149.         private CSVFormat format;
  150.         private long characterOffset;
  151.         private long recordNumber = 1;
  152.         private boolean trackBytes;

  153.         /**
  154.          * Constructs a new instance.
  155.          */
  156.         protected Builder() {
  157.             // empty
  158.         }

  159.         @SuppressWarnings("resource")
  160.         @Override
  161.         public CSVParser get() throws IOException {
  162.             return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
  163.         }

  164.         /**
  165.          * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
  166.          *
  167.          * @param characterOffset the lexer offset.
  168.          * @return this instance.
  169.          */
  170.         public Builder setCharacterOffset(final long characterOffset) {
  171.             this.characterOffset = characterOffset;
  172.             return asThis();
  173.         }

  174.         /**
  175.          * Sets the CSV format. A copy of the given format is kept.
  176.          *
  177.          * @param format the CSV format, {@code null} resets to {@link CSVFormat#DEFAULT}.
  178.          * @return this instance.
  179.          */
  180.         public Builder setFormat(final CSVFormat format) {
  181.             this.format = CSVFormat.copy(format);
  182.             return asThis();
  183.         }

  184.         /**
  185.          * Sets the next record number to assign, defaults to {@code 1}.
  186.          *
  187.          * @param recordNumber the next record number to assign.
  188.          * @return this instance.
  189.          */
  190.         public Builder setRecordNumber(final long recordNumber) {
  191.             this.recordNumber = recordNumber;
  192.             return asThis();
  193.         }

  194.         /**
  195.          * Sets whether to enable byte tracking for the parser.
  196.          *
  197.          * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
  198.          * @return this instance.
  199.          * @since 1.13.0
  200.          */
  201.         public Builder setTrackBytes(final boolean trackBytes) {
  202.             this.trackBytes = trackBytes;
  203.             return asThis();
  204.         }

  205.     }

  206.     final class CSVRecordIterator implements Iterator<CSVRecord> {
  207.         private CSVRecord current;

  208.         /**
  209.          * Gets the next record or null at the end of stream or max rows read.
  210.          *
  211.          * @throws IOException  on parse error or input read-failure
  212.          * @throws CSVException on invalid input.
  213.          * @return the next record, or {@code null} if the end of the stream has been reached.
  214.          */
  215.         private CSVRecord getNextRecord() {
  216.             CSVRecord record = null;
  217.             if (format.useRow(recordNumber + 1)) {
  218.                 record = Uncheck.get(CSVParser.this::nextRecord);
  219.             }
  220.             return record;
  221.         }

  222.         @Override
  223.         public boolean hasNext() {
  224.             if (isClosed()) {
  225.                 return false;
  226.             }
  227.             if (current == null) {
  228.                 current = getNextRecord();
  229.             }
  230.             return current != null;
  231.         }

  232.         @Override
  233.         public CSVRecord next() {
  234.             if (isClosed()) {
  235.                 throw new NoSuchElementException("CSVParser has been closed");
  236.             }
  237.             CSVRecord next = current;
  238.             current = null;
  239.             if (next == null) {
  240.                 // hasNext() wasn't called before
  241.                 next = getNextRecord();
  242.                 if (next == null) {
  243.                     throw new NoSuchElementException("No more CSV records available");
  244.                 }
  245.             }
  246.             return next;
  247.         }

  248.         @Override
  249.         public void remove() {
  250.             throw new UnsupportedOperationException();
  251.         }
  252.     }
  253.     /**
  254.      * Header information based on name and position.
  255.      */
  256.     private static final class Headers {

  257.         /**
  258.          * Header column positions (0-based)
  259.          */
  260.         final Map<String, Integer> headerMap;

  261.         /**
  262.          * Header names in column order
  263.          */
  264.         final List<String> headerNames;

  265.         Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
  266.             this.headerMap = headerMap;
  267.             this.headerNames = headerNames;
  268.         }
  269.     }

  270.     /**
  271.      * Creates a new builder.
  272.      *
  273.      * @return a new builder.
  274.      * @since 1.13.0
  275.      */
  276.     public static Builder builder() {
  277.         return new Builder();
  278.     }

  279.     /**
  280.      * Creates a parser for the given {@link File}.
  281.      *
  282.      * @param file
  283.      *            a CSV file. Must not be null.
  284.      * @param charset
  285.      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
  286.      * @param format
  287.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  288.      * @return a new parser
  289.      * @throws IllegalArgumentException
  290.      *             If the parameters of the format are inconsistent.
  291.      * @throws IOException
  292.      *             If an I/O error occurs
  293.      * @throws CSVException Thrown on invalid CSV input data.
  294.      * @throws NullPointerException if {@code file} is {@code null}.
  295.      */
  296.     public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
  297.         Objects.requireNonNull(file, "file");
  298.         return parse(file.toPath(), charset, format);
  299.     }

  300.     /**
  301.      * Creates a CSV parser using the given {@link CSVFormat}.
  302.      *
  303.      * <p>
  304.      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
  305.      * unless you close the {@code reader}.
  306.      * </p>
  307.      *
  308.      * @param inputStream
  309.      *            an InputStream containing CSV-formatted input, {@code null} maps to {@link CSVFormat#DEFAULT}.
  310.      * @param charset
  311.      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
  312.      * @param format
  313.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  314.      * @return a new CSVParser configured with the given reader and format.
  315.      * @throws IllegalArgumentException
  316.      *             If the parameters of the format are inconsistent or if either reader or format are null.
  317.      * @throws IOException
  318.      *             If there is a problem reading the header or skipping the first record
  319.      * @throws CSVException Thrown on invalid CSV input data.
  320.      * @since 1.5
  321.      */
  322.     public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
  323.             throws IOException {
  324.         return parse(new InputStreamReader(inputStream, Charsets.toCharset(charset)), format);
  325.     }

  326.     /**
  327.      * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
  328.      *
  329.      * @param path
  330.      *            a CSV file. Must not be null.
  331.      * @param charset
  332.      *            The Charset to decode the given file, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
  333.      * @param format
  334.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  335.      * @return a new parser
  336.      * @throws IllegalArgumentException
  337.      *             If the parameters of the format are inconsistent.
  338.      * @throws IOException
  339.      *             If an I/O error occurs
  340.      * @throws CSVException Thrown on invalid CSV input data.
  341.      * @throws NullPointerException if {@code path} is {@code null}.
  342.      * @since 1.5
  343.      */
  344.     @SuppressWarnings("resource")
  345.     public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
  346.         Objects.requireNonNull(path, "path");
  347.         return parse(Files.newInputStream(path), charset, format);
  348.     }

  349.     /**
  350.      * Creates a CSV parser using the given {@link CSVFormat}
  351.      *
  352.      * <p>
  353.      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
  354.      * unless you close the {@code reader}.
  355.      * </p>
  356.      *
  357.      * @param reader
  358.      *            a Reader containing CSV-formatted input. Must not be null.
  359.      * @param format
  360.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  361.      * @return a new CSVParser configured with the given reader and format.
  362.      * @throws IllegalArgumentException
  363.      *             If the parameters of the format are inconsistent or if either reader or format are null.
  364.      * @throws IOException
  365.      *             If there is a problem reading the header or skipping the first record
  366.      * @throws CSVException Thrown on invalid CSV input data.
  367.      * @since 1.5
  368.      */
  369.     public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
  370.         return builder().setReader(reader).setFormat(format).get();
  371.     }

  372.     /**
  373.      * Creates a parser for the given {@link String}.
  374.      *
  375.      * @param string
  376.      *            a CSV string. Must not be null.
  377.      * @param format
  378.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  379.      * @return a new parser
  380.      * @throws IllegalArgumentException
  381.      *             If the parameters of the format are inconsistent.
  382.      * @throws IOException
  383.      *             If an I/O error occurs
  384.      * @throws CSVException Thrown on invalid CSV input data.
  385.      * @throws NullPointerException if {@code string} is {@code null}.
  386.      */
  387.     public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
  388.         Objects.requireNonNull(string, "string");
  389.         return parse(new StringReader(string), format);
  390.     }

  391.     /**
  392.      * Creates and returns a parser for the given URL, which the caller MUST close.
  393.      *
  394.      * <p>
  395.      * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
  396.      * you close the {@code url}.
  397.      * </p>
  398.      *
  399.      * @param url
  400.      *            a URL. Must not be null.
  401.      * @param charset
  402.      *            the charset for the resource, {@code null} maps to the {@link Charset#defaultCharset() default Charset}.
  403.      * @param format
  404.      *            the CSVFormat used for CSV parsing, {@code null} maps to {@link CSVFormat#DEFAULT}.
  405.      * @return a new parser
  406.      * @throws IllegalArgumentException
  407.      *             If the parameters of the format are inconsistent.
  408.      * @throws IOException
  409.      *             If an I/O error occurs
  410.      * @throws CSVException Thrown on invalid CSV input data.
  411.      * @throws NullPointerException if {@code url} is {@code null}.
  412.      */
  413.     @SuppressWarnings("resource")
  414.     public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
  415.         Objects.requireNonNull(url, "url");
  416.         return parse(url.openStream(), charset, format);
  417.     }

  418.     private String headerComment;

  419.     private String trailerComment;

  420.     private final CSVFormat format;

  421.     private final Headers headers;

  422.     private final Lexer lexer;

  423.     private final CSVRecordIterator csvRecordIterator;

  424.     /** A record buffer for getRecord(). Grows as necessary and is reused. */
  425.     private final List<String> recordList = new ArrayList<>();

  426.     /**
  427.      * The next record number to assign.
  428.      */
  429.     private long recordNumber;

  430.     /**
  431.      * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
  432.      * with {@link #recordNumber}.
  433.      */
  434.     private final long characterOffset;

  435.     private final Token reusableToken = new Token();

  436.     /**
  437.      * Constructs a new instance using the given {@link CSVFormat}.
  438.      *
  439.      * <p>
  440.      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
  441.      * unless you close the {@code reader}.
  442.      * </p>
  443.      *
  444.      * @param reader
  445.      *            a Reader containing CSV-formatted input. Must not be null.
  446.      * @param format
  447.      *            the CSVFormat used for CSV parsing. Must not be null.
  448.      * @throws IllegalArgumentException
  449.      *             If the parameters of the format are inconsistent or if either reader or format are null.
  450.      * @throws IOException
  451.      *             If there is a problem reading the header or skipping the first record
  452.      * @throws CSVException Thrown on invalid CSV input data.
  453.      * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
  454.      */
  455.     @Deprecated
  456.     public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
  457.         this(reader, format, 0, 1);
  458.     }

  459.     /**
  460.      * Constructs a new instance using the given {@link CSVFormat}.
  461.      *
  462.      * <p>
  463.      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
  464.      * unless you close the {@code reader}.
  465.      * </p>
  466.      *
  467.      * @param reader
  468.      *            a Reader containing CSV-formatted input. Must not be null.
  469.      * @param format
  470.      *            the CSVFormat used for CSV parsing. Must not be null.
  471.      * @param characterOffset
  472.      *            Lexer offset when the parser does not start parsing at the beginning of the source.
  473.      * @param recordNumber
  474.      *            The next record number to assign.
  475.      * @throws IllegalArgumentException
  476.      *             If the parameters of the format are inconsistent or if either the reader or format is null.
  477.      * @throws IOException
  478.      *             if there is a problem reading the header or skipping the first record
  479.      * @throws CSVException on invalid input.
  480.      * @since 1.1
  481.      * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
  482.      */
  483.     @Deprecated
  484.     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException {
  485.         this(reader, format, characterOffset, recordNumber, null, false);
  486.     }

  487.     /**
  488.      * Constructs a new instance using the given {@link CSVFormat}.
  489.      *
  490.      * <p>
  491.      * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
  492.      * unless you close the {@code reader}.
  493.      * </p>
  494.      *
  495.      * @param reader
  496.      *            a Reader containing CSV-formatted input. Must not be null.
  497.      * @param format
  498.      *            the CSVFormat used for CSV parsing. Must not be null.
  499.      * @param characterOffset
  500.      *            Lexer offset when the parser does not start parsing at the beginning of the source.
  501.      * @param recordNumber
  502.      *            The next record number to assign.
  503.      * @param charset
  504.      *            The character encoding to be used for the reader when enableByteTracking is true.
  505.      * @param trackBytes
  506.      *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
  507.      * @throws IllegalArgumentException
  508.      *             If the parameters of the format are inconsistent or if either the reader or format is null.
  509.      * @throws IOException
  510.      *             If there is a problem reading the header or skipping the first record.
  511.      * @throws CSVException Thrown on invalid CSV input data.
  512.      */
  513.     private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
  514.         final Charset charset, final boolean trackBytes)
  515.         throws IOException {
  516.         Objects.requireNonNull(reader, "reader");
  517.         Objects.requireNonNull(format, "format");
  518.         this.format = format.copy();
  519.         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
  520.         this.csvRecordIterator = new CSVRecordIterator();
  521.         this.headers = createHeaders();
  522.         this.characterOffset = characterOffset;
  523.         this.recordNumber = recordNumber - 1;
  524.     }

  525.     private void addRecordValue(final boolean lastRecord) {
  526.         final String input = format.trim(reusableToken.content.toString());
  527.         if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
  528.             return;
  529.         }
  530.         recordList.add(handleNull(input));
  531.     }

  532.     /**
  533.      * Closes resources.
  534.      *
  535.      * @throws IOException
  536.      *             If an I/O error occurs
  537.      */
  538.     @Override
  539.     public void close() throws IOException {
  540.         lexer.close();
  541.     }

  542.     private Map<String, Integer> createEmptyHeaderMap() {
  543.         return format.getIgnoreHeaderCase() ?
  544.                 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
  545.                 new LinkedHashMap<>();
  546.     }

  547.     /**
  548.      * Creates the name to index mapping if the format defines a header.
  549.      *
  550.      * @return null if the format has no header.
  551.      * @throws IOException if there is a problem reading the header or skipping the first record
  552.      * @throws CSVException on invalid input.
  553.      */
  554.     private Headers createHeaders() throws IOException {
  555.         Map<String, Integer> headerMap = null;
  556.         List<String> headerNames = null;
  557.         final String[] formatHeader = format.getHeader();
  558.         if (formatHeader != null) {
  559.             headerMap = createEmptyHeaderMap();
  560.             String[] headerRecord = null;
  561.             if (formatHeader.length == 0) {
  562.                 // read the header from the first line of the file
  563.                 final CSVRecord nextRecord = nextRecord();
  564.                 if (nextRecord != null) {
  565.                     headerRecord = nextRecord.values();
  566.                     headerComment = nextRecord.getComment();
  567.                 }
  568.             } else {
  569.                 if (format.getSkipHeaderRecord()) {
  570.                     final CSVRecord nextRecord = nextRecord();
  571.                     if (nextRecord != null) {
  572.                         headerComment = nextRecord.getComment();
  573.                     }
  574.                 }
  575.                 headerRecord = formatHeader;
  576.             }
  577.             // build the name to index mappings
  578.             if (headerRecord != null) {
  579.                 // Track an occurrence of a null, empty or blank header.
  580.                 boolean observedMissing = false;
  581.                 for (int i = 0; i < headerRecord.length; i++) {
  582.                     final String header = headerRecord[i];
  583.                     final boolean blankHeader = CSVFormat.isBlank(header);
  584.                     if (blankHeader && !format.getAllowMissingColumnNames()) {
  585.                         throw new IllegalArgumentException("A header name is missing in " + Arrays.toString(headerRecord));
  586.                     }
  587.                     final boolean containsHeader = blankHeader ? observedMissing : headerMap.containsKey(header);
  588.                     final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
  589.                     final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
  590.                     final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
  591.                     if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
  592.                         throw new IllegalArgumentException(String.format(
  593.                                 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
  594.                                 header, Arrays.toString(headerRecord)));
  595.                     }
  596.                     observedMissing |= blankHeader;
  597.                     if (header != null) {
  598.                         headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional
  599.                         if (headerNames == null) {
  600.                             headerNames = new ArrayList<>(headerRecord.length);
  601.                         }
  602.                         headerNames.add(header);
  603.                     }
  604.                 }
  605.             }
  606.         }
  607.         // Make header names Collection immutable
  608.         return new Headers(headerMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
  609.     }

  610.     /**
  611.      * Gets the current line number in the input stream.
  612.      *
  613.      * <p>
  614.      * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
  615.      * the record number.
  616.      * </p>
  617.      *
  618.      * @return current line number.
  619.      */
  620.     public long getCurrentLineNumber() {
  621.         return lexer.getCurrentLineNumber();
  622.     }

  623.     /**
  624.      * Gets the first end-of-line string encountered.
  625.      *
  626.      * @return the first end-of-line string.
  627.      * @since 1.5
  628.      */
  629.     public String getFirstEndOfLine() {
  630.         return lexer.getFirstEol();
  631.     }

  632.     /**
  633.      * Gets the header comment, if any.
  634.      * The header comment appears before the header record.
  635.      *
  636.      * @return the header comment for this stream, or null if no comment is available.
  637.      * @since 1.10.0
  638.      */
  639.     public String getHeaderComment() {
  640.         return headerComment;
  641.     }

  642.     /**
  643.      * Gets a copy of the header map as defined in the CSVFormat's header.
  644.      * <p>
  645.      * The map keys are column names. The map values are 0-based indices.
  646.      * </p>
  647.      * <p>
  648.      * <strong>Note:</strong> The map can only provide a one-to-one mapping when the format did not
  649.      * contain null or duplicate column names.
  650.      * </p>
  651.      *
  652.      * @return a copy of the header map.
  653.      */
  654.     public Map<String, Integer> getHeaderMap() {
  655.         if (headers.headerMap == null) {
  656.             return null;
  657.         }
  658.         final Map<String, Integer> map = createEmptyHeaderMap();
  659.         map.putAll(headers.headerMap);
  660.         return map;
  661.     }

  662.     /**
  663.      * Gets the underlying header map.
  664.      *
  665.      * @return the underlying header map.
  666.      */
  667.     Map<String, Integer> getHeaderMapRaw() {
  668.         return headers.headerMap;
  669.     }

  670.     /**
  671.      * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
  672.      * <p>
  673.      * Note: The list provides strings that can be used as keys in the header map.
  674.      * The list will not contain null column names if they were present in the input
  675.      * format.
  676.      * </p>
  677.      *
  678.      * @return read-only list of header names that iterates in column order.
  679.      * @see #getHeaderMap()
  680.      * @since 1.7
  681.      */
  682.     public List<String> getHeaderNames() {
  683.         return Collections.unmodifiableList(headers.headerNames);
  684.     }

  685.     /**
  686.      * Gets the current record number in the input stream.
  687.      *
  688.      * <p>
  689.      * <strong>Note:</strong> If your CSV input has multi-line values, the returned number does not correspond to
  690.      * the line number.
  691.      * </p>
  692.      *
  693.      * @return current record number
  694.      */
  695.     public long getRecordNumber() {
  696.         return recordNumber;
  697.     }

  698.     /**
  699.      * Parses the CSV input according to the given format and returns the content as a list of
  700.      * {@link CSVRecord CSVRecords}.
  701.      *
  702.      * <p>
  703.      * The returned content starts at the current parse-position in the stream.
  704.      * </p>
  705.      * <p>
  706.      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows this method produces.
  707.      * </p>
  708.      *
  709.      * @return list of {@link CSVRecord CSVRecords}, may be empty
  710.      * @throws UncheckedIOException
  711.      *             on parse error or input read-failure
  712.      */
  713.     public List<CSVRecord> getRecords() {
  714.         return stream().collect(Collectors.toList());
  715.     }

  716.     /**
  717.      * Gets the trailer comment, if any.
  718.      * Trailer comments are located between the last record and EOF
  719.      *
  720.      * @return the trailer comment for this stream, or null if no comment is available.
  721.      * @since 1.10.0
  722.      */
  723.     public String getTrailerComment() {
  724.         return trailerComment;
  725.     }

  726.     /**
  727.      * Handles whether the input is parsed as null
  728.      *
  729.      * @param input
  730.      *           the cell data to further processed
  731.      * @return null if input is parsed as null, or input itself if the input isn't parsed as null
  732.      */
  733.     private String handleNull(final String input) {
  734.         final boolean isQuoted = reusableToken.isQuoted;
  735.         final String nullString = format.getNullString();
  736.         final boolean strictQuoteMode = isStrictQuoteMode();
  737.         if (input.equals(nullString)) {
  738.             // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
  739.             return strictQuoteMode && isQuoted ? input : null;
  740.         }
  741.         // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
  742.         return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
  743.     }

  744.     /**
  745.      * Checks whether there is a header comment.
  746.      * The header comment appears before the header record.
  747.      * Note that if the parser's format has been given an explicit header
  748.      * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
  749.      * and the header record is not being skipped
  750.      * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
  751.      * will be associated with the first record, not the header.
  752.      *
  753.      * @return true if this parser has seen a header comment, false otherwise
  754.      * @since 1.10.0
  755.      */
  756.     public boolean hasHeaderComment() {
  757.         return headerComment != null;
  758.     }

  759.     /**
  760.      * Checks whether there is a trailer comment.
  761.      * Trailer comments are located between the last record and EOF.
  762.      * The trailer comments will only be available after the parser has
  763.      * finished processing this stream.
  764.      *
  765.      * @return true if this parser has seen a trailer comment, false otherwise
  766.      * @since 1.10.0
  767.      */
  768.     public boolean hasTrailerComment() {
  769.         return trailerComment != null;
  770.     }

  771.     /**
  772.      * Tests whether this parser is closed.
  773.      *
  774.      * @return whether this parser is closed.
  775.      */
  776.     public boolean isClosed() {
  777.         return lexer.isClosed();
  778.     }

  779.     /**
  780.      * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
  781.      *
  782.      * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
  783.      *         {@link QuoteMode#NON_NUMERIC}.
  784.      */
  785.     private boolean isStrictQuoteMode() {
  786.         return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
  787.                format.getQuoteMode() == QuoteMode.NON_NUMERIC;
  788.     }

  789.     /**
  790.      * Returns the record iterator.
  791.      *
  792.      * <p>
  793.      * An {@link IOException} caught during the iteration is re-thrown as an
  794.      * {@link IllegalStateException}.
  795.      * </p>
  796.      * <p>
  797.      * If the parser is closed, the iterator will not yield any more records.
  798.      * A call to {@link Iterator#hasNext()} will return {@code false} and
  799.      * a call to {@link Iterator#next()} will throw a
  800.      * {@link NoSuchElementException}.
  801.      * </p>
  802.      * <p>
  803.      * If it is necessary to construct an iterator which is usable after the
  804.      * parser is closed, one option is to extract all records as a list with
  805.      * {@link #getRecords()}, and return an iterator to that list.
  806.      * </p>
  807.      * <p>
  808.      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows an Iterator produces.
  809.      * </p>
  810.      */
  811.     @Override
  812.     public Iterator<CSVRecord> iterator() {
  813.         return csvRecordIterator;
  814.     }

  815.     /**
  816.      * Parses the next record from the current point in the stream.
  817.      *
  818.      * @return the record as an array of values, or {@code null} if the end of the stream has been reached.
  819.      * @throws IOException  on parse error or input read-failure.
  820.      * @throws CSVException on invalid CSV input data.
  821.      */
  822.     CSVRecord nextRecord() throws IOException {
  823.         CSVRecord result = null;
  824.         recordList.clear();
  825.         StringBuilder sb = null;
  826.         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
  827.         final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
  828.         do {
  829.             reusableToken.reset();
  830.             lexer.nextToken(reusableToken);
  831.             switch (reusableToken.type) {
  832.             case TOKEN:
  833.                 addRecordValue(false);
  834.                 break;
  835.             case EORECORD:
  836.                 addRecordValue(true);
  837.                 break;
  838.             case EOF:
  839.                 if (reusableToken.isReady) {
  840.                     addRecordValue(true);
  841.                 } else if (sb != null) {
  842.                     trailerComment = sb.toString();
  843.                 }
  844.                 break;
  845.             case INVALID:
  846.                 throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
  847.             case COMMENT: // Ignored currently
  848.                 if (sb == null) { // first comment for this record
  849.                     sb = new StringBuilder();
  850.                 } else {
  851.                     sb.append(Constants.LF);
  852.                 }
  853.                 sb.append(reusableToken.content);
  854.                 reusableToken.type = TOKEN; // Read another token
  855.                 break;
  856.             default:
  857.                 throw new CSVException("Unexpected Token type: %s", reusableToken.type);
  858.             }
  859.         } while (reusableToken.type == TOKEN);

  860.         if (!recordList.isEmpty()) {
  861.             recordNumber++;
  862.             final String comment = Objects.toString(sb, null);
  863.             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
  864.                 recordNumber, startCharPosition, startBytePosition);
  865.         }
  866.         return result;
  867.     }

  868.     /**
  869.      * Returns a sequential {@code Stream} with this collection as its source.
  870.      * <p>
  871.      * If the parser is closed, the stream will not produce any more values.
  872.      * See the comments in {@link #iterator()}.
  873.      * </p>
  874.      * <p>
  875.      * You can use {@link CSVFormat.Builder#setMaxRows(long)} to limit how many rows a Stream produces.
  876.      * </p>
  877.      *
  878.      * @return a sequential {@code Stream} with this collection as its source.
  879.      * @since 1.9.0
  880.      */
  881.     public Stream<CSVRecord> stream() {
  882.         return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
  883.     }

  884. }