001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import java.io.Closeable;
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.URL;
028import java.nio.charset.Charset;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Iterator;
032import java.util.LinkedHashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.NoSuchElementException;
036
037import static org.apache.commons.csv.Token.Type.*;
038
039/**
040 * Parses CSV files according to the specified format.
041 *
042 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
043 * specification of a {@link CSVFormat}.
044 *
045 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
046 *
047 * <h2>Creating instances</h2>
048 * <p>
049 * There are several static factory methods that can be used to create instances for various types of resources:
050 * </p>
051 * <ul>
052 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
053 *     <li>{@link #parse(String, CSVFormat)}</li>
054 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
055 * </ul>
056 * <p>
057 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
058 *
059 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
060 * </p>
061 * <pre>
062 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
063 *     ...
064 * }
065 * </pre>
066 *
067 * <h2>Parsing record wise</h2>
068 * <p>
069 * To parse a CSV input from a file, you write:
070 * </p>
071 *
072 * <pre>
073 * File csvData = new File(&quot;/path/to/csv&quot;);
074 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
075 * for (CSVRecord csvRecord : parser) {
076 *     ...
077 * }
078 * </pre>
079 *
080 * <p>
081 * This will read the parse the contents of the file using the
082 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
083 * </p>
084 *
085 * <p>
086 * To parse CSV input in a format like Excel, you write:
087 * </p>
088 *
089 * <pre>
090 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
091 * for (CSVRecord csvRecord : parser) {
092 *     ...
093 * }
094 * </pre>
095 *
096 * <p>
097 * If the predefined formats don't match the format at hands, custom formats can be defined. More information about
098 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
099 * </p>
100 *
101 * <h2>Parsing into memory</h2>
102 * <p>
103 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
104 * </p>
105 *
106 * <pre>
107 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
108 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
109 * List&lt;CSVRecord&gt; list = parser.getRecords();
110 * </pre>
111 *
112 * <p>
113 * There are two constraints that have to be kept in mind:
114 * </p>
115 *
116 * <ol>
117 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121 * </ol>
122 *
123 * <h2>Notes</h2>
124 * <p>
125 * Internal parser state is completely covered by the format and the reader-state.
126 * </p>
127 *
128 * @version $Id: CSVParser.java 1637611 2014-11-08 23:38:48Z ggregory $
129 *
130 * @see <a href="package-summary.html">package documentation for more details</a>
131 */
132public final class CSVParser implements Iterable<CSVRecord>, Closeable {
133
134    /**
135     * Creates a parser for the given {@link File}.
136     *
137     * <p><strong>Note:</strong> This method internally creates a FileReader using
138     * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
139     * is executing the code. If this is insufficient create a URL to the file and use
140     * {@link #parse(URL, Charset, CSVFormat)}</p>
141     *
142     * @param file
143     *            a CSV file. Must not be null.
144     * @param charset
145     *            A charset
146     * @param format
147     *            the CSVFormat used for CSV parsing. Must not be null.
148     * @return a new parser
149     * @throws IllegalArgumentException
150     *             If the parameters of the format are inconsistent or if either file or format are null.
151     * @throws IOException
152     *             If an I/O error occurs
153     */
154    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
155        Assertions.notNull(file, "file");
156        Assertions.notNull(format, "format");
157        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
158    }
159
160    /**
161     * Creates a parser for the given {@link String}.
162     *
163     * @param string
164     *            a CSV string. Must not be null.
165     * @param format
166     *            the CSVFormat used for CSV parsing. Must not be null.
167     * @return a new parser
168     * @throws IllegalArgumentException
169     *             If the parameters of the format are inconsistent or if either string or format are null.
170     * @throws IOException
171     *             If an I/O error occurs
172     */
173    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
174        Assertions.notNull(string, "string");
175        Assertions.notNull(format, "format");
176
177        return new CSVParser(new StringReader(string), format);
178    }
179
180    /**
181     * Creates a parser for the given URL.
182     *
183     * <p>
184     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
185     * you close the {@code url}.
186     * </p>
187     *
188     * @param url
189     *            a URL. Must not be null.
190     * @param charset
191     *            the charset for the resource. Must not be null.
192     * @param format
193     *            the CSVFormat used for CSV parsing. Must not be null.
194     * @return a new parser
195     * @throws IllegalArgumentException
196     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
197     * @throws IOException
198     *             If an I/O error occurs
199     */
200    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
201        Assertions.notNull(url, "url");
202        Assertions.notNull(charset, "charset");
203        Assertions.notNull(format, "format");
204
205        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
206    }
207
208    // the following objects are shared to reduce garbage
209
210    private final CSVFormat format;
211
212    /** A mapping of column names to column indices */
213    private final Map<String, Integer> headerMap;
214
215    private final Lexer lexer;
216
217    /** A record buffer for getRecord(). Grows as necessary and is reused. */
218    private final List<String> record = new ArrayList<String>();
219
220    /**
221     * The next record number to assign.
222     */
223    private long recordNumber;
224
225    /**
226     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
227     * with {@link #recordNumber}.
228     */
229    private final long characterOffset;
230
231    private final Token reusableToken = new Token();
232
233    /**
234     * Customized CSV parser using the given {@link CSVFormat}
235     *
236     * <p>
237     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
238     * unless you close the {@code reader}.
239     * </p>
240     *
241     * @param reader
242     *            a Reader containing CSV-formatted input. Must not be null.
243     * @param format
244     *            the CSVFormat used for CSV parsing. Must not be null.
245     * @throws IllegalArgumentException
246     *             If the parameters of the format are inconsistent or if either reader or format are null.
247     * @throws IOException
248     *             If there is a problem reading the header or skipping the first record
249     */
250    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
251        this(reader, format, 0, 1);
252    }
253
254    /**
255     * Customized CSV parser using the given {@link CSVFormat}
256     *
257     * <p>
258     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
259     * unless you close the {@code reader}.
260     * </p>
261     *
262     * @param reader
263     *            a Reader containing CSV-formatted input. Must not be null.
264     * @param format
265     *            the CSVFormat used for CSV parsing. Must not be null.
266     * @param characterOffset
267     *            Lexer offset when the parser does not start parsing at the beginning of the source.
268     * @param recordNumber
269     *            The next record number to assign
270     * @throws IllegalArgumentException
271     *             If the parameters of the format are inconsistent or if either reader or format are null.
272     * @throws IOException
273     *             If there is a problem reading the header or skipping the first record
274     * @since 1.1
275     */
276    public CSVParser(final Reader reader, final CSVFormat format, long characterOffset, long recordNumber)
277            throws IOException {
278        Assertions.notNull(reader, "reader");
279        Assertions.notNull(format, "format");
280
281        this.format = format;
282        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
283        this.headerMap = this.initializeHeader();
284        this.characterOffset = characterOffset;
285        this.recordNumber = recordNumber - 1;
286    }
287
288    private void addRecordValue() {
289        final String input = this.reusableToken.content.toString();
290        final String nullString = this.format.getNullString();
291        if (nullString == null) {
292            this.record.add(input);
293        } else {
294            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
295        }
296    }
297
298    /**
299     * Closes resources.
300     *
301     * @throws IOException
302     *             If an I/O error occurs
303     */
304    @Override
305    public void close() throws IOException {
306        if (this.lexer != null) {
307            this.lexer.close();
308        }
309    }
310
311    /**
312     * Returns the current line number in the input stream.
313     *
314     * <p>
315     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
316     * the record number.
317     * </p>
318     *
319     * @return current line number
320     */
321    public long getCurrentLineNumber() {
322        return this.lexer.getCurrentLineNumber();
323    }
324
325    /**
326     * Returns a copy of the header map that iterates in column order.
327     * <p>
328     * The map keys are column names. The map values are 0-based indices.
329     * </p>
330     * @return a copy of the header map that iterates in column order.
331     */
332    public Map<String, Integer> getHeaderMap() {
333        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
334    }
335
336    /**
337     * Returns the current record number in the input stream.
338     *
339     * <p>
340     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
341     * the line number.
342     * </p>
343     *
344     * @return current record number
345     */
346    public long getRecordNumber() {
347        return this.recordNumber;
348    }
349
350    /**
351     * Parses the CSV input according to the given format and returns the content as a list of
352     * {@link CSVRecord CSVRecords}.
353     *
354     * <p>
355     * The returned content starts at the current parse-position in the stream.
356     * </p>
357     *
358     * @return list of {@link CSVRecord CSVRecords}, may be empty
359     * @throws IOException
360     *             on parse error or input read-failure
361     */
362    public List<CSVRecord> getRecords() throws IOException {
363        CSVRecord rec;
364        List<CSVRecord> records = new ArrayList<CSVRecord>();
365        while ((rec = this.nextRecord()) != null) {
366            records.add(rec);
367        }
368        return records;
369    }
370
371    /**
372     * Initializes the name to index mapping if the format defines a header.
373     *
374     * @return null if the format has no header.
375     * @throws IOException if there is a problem reading the header or skipping the first record
376     */
377    private Map<String, Integer> initializeHeader() throws IOException {
378        Map<String, Integer> hdrMap = null;
379        final String[] formatHeader = this.format.getHeader();
380        if (formatHeader != null) {
381            hdrMap = new LinkedHashMap<String, Integer>();
382
383            String[] headerRecord = null;
384            if (formatHeader.length == 0) {
385                // read the header from the first line of the file
386                final CSVRecord nextRecord = this.nextRecord();
387                if (nextRecord != null) {
388                    headerRecord = nextRecord.values();
389                }
390            } else {
391                if (this.format.getSkipHeaderRecord()) {
392                    this.nextRecord();
393                }
394                headerRecord = formatHeader;
395            }
396
397            // build the name to index mappings
398            if (headerRecord != null) {
399                for (int i = 0; i < headerRecord.length; i++) {
400                    final String header = headerRecord[i];
401                    final boolean containsHeader = hdrMap.containsKey(header);
402                    final boolean emptyHeader = header == null || header.trim().isEmpty();
403                    if (containsHeader &&
404                            (!emptyHeader || (emptyHeader && !this.format.getAllowMissingColumnNames()))) {
405                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
406                                "\" in " + Arrays.toString(headerRecord));
407                    }
408                    hdrMap.put(header, Integer.valueOf(i));
409                }
410            }
411        }
412        return hdrMap;
413    }
414
415    /**
416     * Gets whether this parser is closed.
417     *
418     * @return whether this parser is closed.
419     */
420    public boolean isClosed() {
421        return this.lexer.isClosed();
422    }
423
424    /**
425     * Returns an iterator on the records.
426     *
427     * <p>IOExceptions occurring during the iteration are wrapped in a
428     * RuntimeException.
429     * If the parser is closed a call to {@code next()} will throw a
430     * NoSuchElementException.</p>
431     */
432    @Override
433    public Iterator<CSVRecord> iterator() {
434        return new Iterator<CSVRecord>() {
435            private CSVRecord current;
436
437            private CSVRecord getNextRecord() {
438                try {
439                    return CSVParser.this.nextRecord();
440                } catch (final IOException e) {
441                    // TODO: This is not great, throw an ISE instead?
442                    throw new RuntimeException(e);
443                }
444            }
445
446            @Override
447            public boolean hasNext() {
448                if (CSVParser.this.isClosed()) {
449                    return false;
450                }
451                if (this.current == null) {
452                    this.current = this.getNextRecord();
453                }
454
455                return this.current != null;
456            }
457
458            @Override
459            public CSVRecord next() {
460                if (CSVParser.this.isClosed()) {
461                    throw new NoSuchElementException("CSVParser has been closed");
462                }
463                CSVRecord next = this.current;
464                this.current = null;
465
466                if (next == null) {
467                    // hasNext() wasn't called before
468                    next = this.getNextRecord();
469                    if (next == null) {
470                        throw new NoSuchElementException("No more CSV records available");
471                    }
472                }
473
474                return next;
475            }
476
477            @Override
478            public void remove() {
479                throw new UnsupportedOperationException();
480            }
481        };
482    }
483
484    /**
485     * Parses the next record from the current point in the stream.
486     *
487     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
488     * @throws IOException
489     *             on parse error or input read-failure
490     */
491    CSVRecord nextRecord() throws IOException {
492        CSVRecord result = null;
493        this.record.clear();
494        StringBuilder sb = null;
495        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
496        do {
497            this.reusableToken.reset();
498            this.lexer.nextToken(this.reusableToken);
499            switch (this.reusableToken.type) {
500            case TOKEN:
501                this.addRecordValue();
502                break;
503            case EORECORD:
504                this.addRecordValue();
505                break;
506            case EOF:
507                if (this.reusableToken.isReady) {
508                    this.addRecordValue();
509                }
510                break;
511            case INVALID:
512                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
513            case COMMENT: // Ignored currently
514                if (sb == null) { // first comment for this record
515                    sb = new StringBuilder();
516                } else {
517                    sb.append(Constants.LF);
518                }
519                sb.append(this.reusableToken.content);
520                this.reusableToken.type = TOKEN; // Read another token
521                break;
522            default:
523                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
524            }
525        } while (this.reusableToken.type == TOKEN);
526
527        if (!this.record.isEmpty()) {
528            this.recordNumber++;
529            final String comment = sb == null ? null : sb.toString();
530            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
531                    this.recordNumber, startCharPosition);
532        }
533        return result;
534    }
535
536}