001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import java.io.Closeable;
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.URL;
028import java.nio.charset.Charset;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Iterator;
032import java.util.LinkedHashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.NoSuchElementException;
036
037import static org.apache.commons.csv.Token.Type.*;
038
039/**
040 * Parses CSV files according to the specified format.
041 *
042 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
043 * specification of a {@link CSVFormat}.
044 *
045 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
046 *
047 * <h2>Creating instances</h2>
048 * <p>
049 * There are several static factory methods that can be used to create instances for various types of resources:
050 * </p>
051 * <ul>
052 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
053 *     <li>{@link #parse(String, CSVFormat)}</li>
054 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
055 * </ul>
056 * <p>
057 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
058 *
059 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
060 * </p>
061 * <pre>
062 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
063 *     ...
064 * }
065 * </pre>
066 *
067 * <h2>Parsing record wise</h2>
068 * <p>
069 * To parse a CSV input from a file, you write:
070 * </p>
071 *
072 * <pre>
073 * File csvData = new File(&quot;/path/to/csv&quot;);
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
075 * for (CSVRecord csvRecord : parser) {
076 *     ...
077 * }
078 * </pre>
079 *
080 * <p>
 * This will read and parse the contents of the file using the
082 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
083 * </p>
084 *
085 * <p>
086 * To parse CSV input in a format like Excel, you write:
087 * </p>
088 *
089 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
091 * for (CSVRecord csvRecord : parser) {
092 *     ...
093 * }
094 * </pre>
095 *
096 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
098 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
099 * </p>
100 *
101 * <h2>Parsing into memory</h2>
102 * <p>
103 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
104 * </p>
105 *
106 * <pre>
107 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
108 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
109 * List&lt;CSVRecord&gt; list = parser.getRecords();
110 * </pre>
111 *
112 * <p>
113 * There are two constraints that have to be kept in mind:
114 * </p>
115 *
116 * <ol>
117 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121 * </ol>
122 *
123 * <h2>Notes</h2>
124 * <p>
125 * Internal parser state is completely covered by the format and the reader-state.
126 * </p>
127 *
128 * @version $Id: CSVParser.java 1617069 2014-08-10 08:53:42Z britter $
129 *
130 * @see <a href="package-summary.html">package documentation for more details</a>
131 */
132public final class CSVParser implements Iterable<CSVRecord>, Closeable {
133
134    /**
135     * Creates a parser for the given {@link File}.
136     *
137     * <p><strong>Note:</strong> This method internally creates a FileReader using
138     * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
139     * is executing the code. If this is insufficient create a URL to the file and use
140     * {@link #parse(URL, Charset, CSVFormat)}</p>
141     *
142     * @param file
143     *            a CSV file. Must not be null.
144     * @param charset
145     *            A charset
146     * @param format
147     *            the CSVFormat used for CSV parsing. Must not be null.
148     * @return a new parser
149     * @throws IllegalArgumentException
150     *             If the parameters of the format are inconsistent or if either file or format are null.
151     * @throws IOException
152     *             If an I/O error occurs
153     */
154    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
155        Assertions.notNull(file, "file");
156        Assertions.notNull(format, "format");
157        // Use the default Charset explicitly
158        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
159    }
160
161    /**
162     * Creates a parser for the given {@link String}.
163     *
164     * @param string
165     *            a CSV string. Must not be null.
166     * @param format
167     *            the CSVFormat used for CSV parsing. Must not be null.
168     * @return a new parser
169     * @throws IllegalArgumentException
170     *             If the parameters of the format are inconsistent or if either string or format are null.
171     * @throws IOException
172     *             If an I/O error occurs
173     */
174    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
175        Assertions.notNull(string, "string");
176        Assertions.notNull(format, "format");
177
178        return new CSVParser(new StringReader(string), format);
179    }
180
181    /**
182     * Creates a parser for the given URL.
183     *
184     * <p>
185     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
186     * you close the {@code url}.
187     * </p>
188     *
189     * @param url
190     *            a URL. Must not be null.
191     * @param charset
192     *            the charset for the resource. Must not be null.
193     * @param format
194     *            the CSVFormat used for CSV parsing. Must not be null.
195     * @return a new parser
196     * @throws IllegalArgumentException
197     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
198     * @throws IOException
199     *             If an I/O error occurs
200     */
201    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
202        Assertions.notNull(url, "url");
203        Assertions.notNull(charset, "charset");
204        Assertions.notNull(format, "format");
205
206        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
207    }
208
209    // the following objects are shared to reduce garbage
210
211    private final CSVFormat format;
212
213    /** A mapping of column names to column indices */
214    private final Map<String, Integer> headerMap;
215
216    private final Lexer lexer;
217
218    /** A record buffer for getRecord(). Grows as necessary and is reused. */
219    private final List<String> record = new ArrayList<String>();
220
221    private long recordNumber;
222
223    private final Token reusableToken = new Token();
224
225    /**
226     * Customized CSV parser using the given {@link CSVFormat}
227     *
228     * <p>
229     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
230     * unless you close the {@code reader}.
231     * </p>
232     *
233     * @param reader
234     *            a Reader containing CSV-formatted input. Must not be null.
235     * @param format
236     *            the CSVFormat used for CSV parsing. Must not be null.
237     * @throws IllegalArgumentException
238     *             If the parameters of the format are inconsistent or if either reader or format are null.
239     * @throws IOException
240     *             If there is a problem reading the header or skipping the first record
241     */
242    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
243        Assertions.notNull(reader, "reader");
244        Assertions.notNull(format, "format");
245
246        this.format = format;
247        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
248        this.headerMap = this.initializeHeader();
249    }
250
251    private void addRecordValue() {
252        final String input = this.reusableToken.content.toString();
253        final String nullString = this.format.getNullString();
254        if (nullString == null) {
255            this.record.add(input);
256        } else {
257            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
258        }
259    }
260
261    /**
262     * Closes resources.
263     *
264     * @throws IOException
265     *             If an I/O error occurs
266     */
267    public void close() throws IOException {
268        if (this.lexer != null) {
269            this.lexer.close();
270        }
271    }
272
273    /**
274     * Returns the current line number in the input stream.
275     *
276     * <p>
277     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
278     * the record number.
279     * </p>
280     *
281     * @return current line number
282     */
283    public long getCurrentLineNumber() {
284        return this.lexer.getCurrentLineNumber();
285    }
286
287    /**
288     * Returns a copy of the header map that iterates in column order.
289     * <p>
290     * The map keys are column names. The map values are 0-based indices.
291     * </p>
292     * @return a copy of the header map that iterates in column order.
293     */
294    public Map<String, Integer> getHeaderMap() {
295        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
296    }
297
298    /**
299     * Returns the current record number in the input stream.
300     *
301     * <p>
302     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
303     * the line number.
304     * </p>
305     *
306     * @return current line number
307     */
308    public long getRecordNumber() {
309        return this.recordNumber;
310    }
311
312    /**
313     * Parses the CSV input according to the given format and returns the content as a list of
314     * {@link CSVRecord CSVRecords}.
315     *
316     * <p>
317     * The returned content starts at the current parse-position in the stream.
318     * </p>
319     *
320     * @return list of {@link CSVRecord CSVRecords}, may be empty
321     * @throws IOException
322     *             on parse error or input read-failure
323     */
324    public List<CSVRecord> getRecords() throws IOException {
325        CSVRecord rec;
326        List<CSVRecord> records = new ArrayList<CSVRecord>();
327        while ((rec = this.nextRecord()) != null) {
328            records.add(rec);
329        }
330        return records;
331    }
332
333    /**
334     * Initializes the name to index mapping if the format defines a header.
335     *
336     * @return null if the format has no header.
337     * @throws IOException if there is a problem reading the header or skipping the first record
338     */
339    private Map<String, Integer> initializeHeader() throws IOException {
340        Map<String, Integer> hdrMap = null;
341        final String[] formatHeader = this.format.getHeader();
342        if (formatHeader != null) {
343            hdrMap = new LinkedHashMap<String, Integer>();
344
345            String[] headerRecord = null;
346            if (formatHeader.length == 0) {
347                // read the header from the first line of the file
348                final CSVRecord nextRecord = this.nextRecord();
349                if (nextRecord != null) {
350                    headerRecord = nextRecord.values();
351                }
352            } else {
353                if (this.format.getSkipHeaderRecord()) {
354                    this.nextRecord();
355                }
356                headerRecord = formatHeader;
357            }
358
359            // build the name to index mappings
360            if (headerRecord != null) {
361                for (int i = 0; i < headerRecord.length; i++) {
362                    final String header = headerRecord[i];
363                    final boolean containsHeader = hdrMap.containsKey(header);
364                    final boolean emptyHeader = header == null || header.trim().isEmpty();
365                    if (containsHeader &&
366                            (!emptyHeader || (emptyHeader && !this.format.getAllowMissingColumnNames()))) {
367                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
368                                "\" in " + Arrays.toString(headerRecord));
369                    }
370                    hdrMap.put(header, Integer.valueOf(i));
371                }
372            }
373        }
374        return hdrMap;
375    }
376
377    public boolean isClosed() {
378        return this.lexer.isClosed();
379    }
380
381    /**
382     * Returns an iterator on the records.
383     *
384     * <p>IOExceptions occurring during the iteration are wrapped in a
385     * RuntimeException.
386     * If the parser is closed a call to {@code next()} will throw a
387     * NoSuchElementException.</p>
388     */
389    public Iterator<CSVRecord> iterator() {
390        return new Iterator<CSVRecord>() {
391            private CSVRecord current;
392
393            private CSVRecord getNextRecord() {
394                try {
395                    return CSVParser.this.nextRecord();
396                } catch (final IOException e) {
397                    // TODO: This is not great, throw an ISE instead?
398                    throw new RuntimeException(e);
399                }
400            }
401
402            public boolean hasNext() {
403                if (CSVParser.this.isClosed()) {
404                    return false;
405                }
406                if (this.current == null) {
407                    this.current = this.getNextRecord();
408                }
409
410                return this.current != null;
411            }
412
413            public CSVRecord next() {
414                if (CSVParser.this.isClosed()) {
415                    throw new NoSuchElementException("CSVParser has been closed");
416                }
417                CSVRecord next = this.current;
418                this.current = null;
419
420                if (next == null) {
421                    // hasNext() wasn't called before
422                    next = this.getNextRecord();
423                    if (next == null) {
424                        throw new NoSuchElementException("No more CSV records available");
425                    }
426                }
427
428                return next;
429            }
430
431            public void remove() {
432                throw new UnsupportedOperationException();
433            }
434        };
435    }
436
437    /**
438     * Parses the next record from the current point in the stream.
439     *
440     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
441     * @throws IOException
442     *             on parse error or input read-failure
443     */
444    CSVRecord nextRecord() throws IOException {
445        CSVRecord result = null;
446        this.record.clear();
447        StringBuilder sb = null;
448        do {
449            this.reusableToken.reset();
450            this.lexer.nextToken(this.reusableToken);
451            switch (this.reusableToken.type) {
452            case TOKEN:
453                this.addRecordValue();
454                break;
455            case EORECORD:
456                this.addRecordValue();
457                break;
458            case EOF:
459                if (this.reusableToken.isReady) {
460                    this.addRecordValue();
461                }
462                break;
463            case INVALID:
464                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
465            case COMMENT: // Ignored currently
466                if (sb == null) { // first comment for this record
467                    sb = new StringBuilder();
468                } else {
469                    sb.append(Constants.LF);
470                }
471                sb.append(this.reusableToken.content);
472                this.reusableToken.type = TOKEN; // Read another token
473                break;
474            default:
475                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
476            }
477        } while (this.reusableToken.type == TOKEN);
478
479        if (!this.record.isEmpty()) {
480            this.recordNumber++;
481            final String comment = sb == null ? null : sb.toString();
482            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
483                    this.recordNumber);
484        }
485        return result;
486    }
487
488}