001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.csv;
019
020import static org.apache.commons.csv.Token.Type.TOKEN;
021
022import java.io.Closeable;
023import java.io.File;
024import java.io.FileReader;
025import java.io.IOException;
026import java.io.InputStreamReader;
027import java.io.Reader;
028import java.io.StringReader;
029import java.net.URL;
030import java.nio.charset.Charset;
031import java.util.ArrayList;
032import java.util.Iterator;
033import java.util.LinkedHashMap;
034import java.util.List;
035import java.util.Map;
036import java.util.NoSuchElementException;
037
038/**
039 * Parses CSV files according to the specified format.
040 *
041 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
042 * specification of a {@link CSVFormat}.
043 *
044 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
045 *
046 * <h4>Creating instances</h4>
047 * There are several static factory methods that can be used to create instances for various types of resources:
048 * <p>
049 * <ul>
050 *     <li>{@link #parse(java.io.File, CSVFormat)}</li>
051 *     <li>{@link #parse(String, CSVFormat)}</li>
052 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
053 * </ul>
054 * </p>
055 * <p>
056 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
057 * 
058 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
059 * </p>
060 * <pre>
061 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
062 *     ...
063 * }
064 * </pre>
065 *
066 * <h4>Parsing record wise</h4>
067 * <p>
068 * To parse a CSV input from a file, you write:
069 * </p>
070 *
071 * <pre>
072 * File csvData = new File(&quot;/path/to/csv&quot;);
073 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
074 * for (CSVRecord csvRecord : parser) {
075 *     ...
076 * }
077 * </pre>
078 *
079 * <p>
 * This will read and parse the contents of the file using the
081 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
082 * </p>
083 *
084 * <p>
085 * To parse CSV input in a format like Excel, you write:
086 * </p>
087 *
088 * <pre>
089 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
090 * for (CSVRecord csvRecord : parser) {
091 *     ...
092 * }
093 * </pre>
094 *
095 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
097 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
098 * </p>
099 *
100 * <h4>Parsing into memory</h4>
101 * <p>
102 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
103 * </p>
104 *
105 * <pre>
106 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
107 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
108 * List&lt;CSVRecord&gt; list = parser.getRecords();
109 * </pre>
110 *
111 * <p>
112 * There are two constraints that have to be kept in mind:
113 * </p>
114 *
115 * <p>
116 * <ol>
117 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
118 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
119 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
120 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
121 * </ol>
122 * </p>
123 *
124 * <h4>Notes</h4>
125 * <p>
126 * Internal parser state is completely covered by the format and the reader-state.
127 * </p>
128 *
129 * @version $Id: CSVParser.java 1559908 2014-01-21 02:44:30Z ggregory $
130 *
131 * @see <a href="package-summary.html">package documentation for more details</a>
132 */
133public final class CSVParser implements Iterable<CSVRecord>, Closeable {
134
135    /**
136     * Creates a parser for the given {@link File}.
137     *
138     * @param file
139     *            a CSV file. Must not be null.
140     * @param format
141     *            the CSVFormat used for CSV parsing. Must not be null.
142     * @return a new parser
143     * @throws IllegalArgumentException
144     *             If the parameters of the format are inconsistent or if either file or format are null.
145     * @throws IOException
146     *             If an I/O error occurs
147     */
148    public static CSVParser parse(final File file, final CSVFormat format) throws IOException {
149        Assertions.notNull(file, "file");
150        Assertions.notNull(format, "format");
151
152        return new CSVParser(new FileReader(file), format);
153    }
154
155    /**
156     * Creates a parser for the given {@link String}.
157     *
158     * @param string
159     *            a CSV string. Must not be null.
160     * @param format
161     *            the CSVFormat used for CSV parsing. Must not be null.
162     * @return a new parser
163     * @throws IllegalArgumentException
164     *             If the parameters of the format are inconsistent or if either string or format are null.
165     * @throws IOException
166     *             If an I/O error occurs
167     */
168    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
169        Assertions.notNull(string, "string");
170        Assertions.notNull(format, "format");
171
172        return new CSVParser(new StringReader(string), format);
173    }
174
175    /**
176     * Creates a parser for the given URL.
177     *
178     * <p>
179     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
180     * you close the {@code url}.
181     * </p>
182     *
183     * @param url
184     *            a URL. Must not be null.
185     * @param charset
186     *            the charset for the resource. Must not be null.
187     * @param format
188     *            the CSVFormat used for CSV parsing. Must not be null.
189     * @return a new parser
190     * @throws IllegalArgumentException
191     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
192     * @throws IOException
193     *             If an I/O error occurs
194     */
195    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
196        Assertions.notNull(url, "url");
197        Assertions.notNull(charset, "charset");
198        Assertions.notNull(format, "format");
199
200        return new CSVParser(new InputStreamReader(url.openStream(),
201                             charset == null ? Charset.forName("UTF-8") : charset), format);
202    }
203
204    // the following objects are shared to reduce garbage
205
206    private final CSVFormat format;
207
208    /** A mapping of column names to column indices */
209    private final Map<String, Integer> headerMap;
210
211    private final Lexer lexer;
212
213    /** A record buffer for getRecord(). Grows as necessary and is reused. */
214    private final List<String> record = new ArrayList<String>();
215
216    private long recordNumber;
217
218    private final Token reusableToken = new Token();
219
220    /**
221     * Customized CSV parser using the given {@link CSVFormat}
222     *
223     * <p>
224     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
225     * unless you close the {@code reader}.
226     * </p>
227     *
228     * @param reader
229     *            a Reader containing CSV-formatted input. Must not be null.
230     * @param format
231     *            the CSVFormat used for CSV parsing. Must not be null.
232     * @throws IllegalArgumentException
233     *             If the parameters of the format are inconsistent or if either reader or format are null.
234     * @throws IOException
235     *             If an I/O error occurs
236     */
237    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
238        Assertions.notNull(reader, "reader");
239        Assertions.notNull(format, "format");
240
241        format.validate();
242        this.format = format;
243        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
244        this.headerMap = this.initializeHeader();
245    }
246
247    private void addRecordValue() {
248        final String input = this.reusableToken.content.toString();
249        final String nullString = this.format.getNullString();
250        if (nullString == null) {
251            this.record.add(input);
252        } else {
253            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
254        }
255    }
256
257    /**
258     * Closes resources.
259     *
260     * @throws IOException
261     *             If an I/O error occurs
262     */
263    public void close() throws IOException {
264        if (this.lexer != null) {
265            this.lexer.close();
266        }
267    }
268
269    /**
270     * Returns the current line number in the input stream.
271     * <p/>
272     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number.
273     *
274     * @return current line number
275     */
276    public long getCurrentLineNumber() {
277        return this.lexer.getCurrentLineNumber();
278    }
279
280    /**
281     * Returns a copy of the header map that iterates in column order.
282     * <p>
283     * The map keys are column names. The map values are 0-based indices.
284     * </p>
285     * @return a copy of the header map that iterates in column order.
286     */
287    public Map<String, Integer> getHeaderMap() {
288        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
289    }
290
291    /**
292     * Returns the current record number in the input stream.
293     * <p/>
294     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number.
295     *
296     * @return current line number
297     */
298    public long getRecordNumber() {
299        return this.recordNumber;
300    }
301
302    /**
303     * Parses the CSV input according to the given format and returns the content as a list of
304     * {@link CSVRecord CSVRecords}.
305     * <p/>
306     * The returned content starts at the current parse-position in the stream.
307     *
308     * @return list of {@link CSVRecord CSVRecords}, may be empty
309     * @throws IOException
310     *             on parse error or input read-failure
311     */
312    public List<CSVRecord> getRecords() throws IOException {
313        final List<CSVRecord> records = new ArrayList<CSVRecord>();
314        CSVRecord rec;
315        while ((rec = this.nextRecord()) != null) {
316            records.add(rec);
317        }
318        return records;
319    }
320
321    /**
322     * Initializes the name to index mapping if the format defines a header.
323     * 
324     * @return null if the format has no header.
325     */
326    private Map<String, Integer> initializeHeader() throws IOException {
327        Map<String, Integer> hdrMap = null;
328        final String[] formatHeader = this.format.getHeader();
329        if (formatHeader != null) {
330            hdrMap = new LinkedHashMap<String, Integer>();
331
332            String[] header = null;
333            if (formatHeader.length == 0) {
334                // read the header from the first line of the file
335                final CSVRecord nextRecord = this.nextRecord();
336                if (nextRecord != null) {
337                    header = nextRecord.values();
338                }
339            } else {
340                if (this.format.getSkipHeaderRecord()) {
341                    this.nextRecord();
342                }
343                header = formatHeader;
344            }
345
346            // build the name to index mappings
347            if (header != null) {
348                for (int i = 0; i < header.length; i++) {
349                    hdrMap.put(header[i], Integer.valueOf(i));
350                }
351            }
352        }
353        return hdrMap;
354    }
355
356    public boolean isClosed() {
357        return this.lexer.isClosed();
358    }
359
360    /**
361     * Returns an iterator on the records.
362     *
363     * <p>IOExceptions occurring during the iteration are wrapped in a
364     * RuntimeException.
365     * If the parser is closed a call to {@code next()} will throw a
366     * NoSuchElementException.</p>
367     */
368    public Iterator<CSVRecord> iterator() {
369        return new Iterator<CSVRecord>() {
370            private CSVRecord current;
371
372            private CSVRecord getNextRecord() {
373                try {
374                    return CSVParser.this.nextRecord();
375                } catch (final IOException e) {
376                    // TODO: This is not great, throw an ISE instead?
377                    throw new RuntimeException(e);
378                }
379            }
380
381            public boolean hasNext() {
382                if (CSVParser.this.isClosed()) {
383                    return false;
384                }
385                if (this.current == null) {
386                    this.current = this.getNextRecord();
387                }
388
389                return this.current != null;
390            }
391
392            public CSVRecord next() {
393                if (CSVParser.this.isClosed()) {
394                    throw new NoSuchElementException("CSVParser has been closed");
395                }
396                CSVRecord next = this.current;
397                this.current = null;
398
399                if (next == null) {
400                    // hasNext() wasn't called before
401                    next = this.getNextRecord();
402                    if (next == null) {
403                        throw new NoSuchElementException("No more CSV records available");
404                    }
405                }
406
407                return next;
408            }
409
410            public void remove() {
411                throw new UnsupportedOperationException();
412            }
413        };
414    }
415
416    /**
417     * Parses the next record from the current point in the stream.
418     *
419     * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
420     * @throws IOException
421     *             on parse error or input read-failure
422     */
423    CSVRecord nextRecord() throws IOException {
424        CSVRecord result = null;
425        this.record.clear();
426        StringBuilder sb = null;
427        do {
428            this.reusableToken.reset();
429            this.lexer.nextToken(this.reusableToken);
430            switch (this.reusableToken.type) {
431            case TOKEN:
432                this.addRecordValue();
433                break;
434            case EORECORD:
435                this.addRecordValue();
436                break;
437            case EOF:
438                if (this.reusableToken.isReady) {
439                    this.addRecordValue();
440                }
441                break;
442            case INVALID:
443                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
444            case COMMENT: // Ignored currently
445                if (sb == null) { // first comment for this record
446                    sb = new StringBuilder();
447                } else {
448                    sb.append(Constants.LF);
449                }
450                sb.append(this.reusableToken.content);
451                this.reusableToken.type = TOKEN; // Read another token
452                break;
453            }
454        } while (this.reusableToken.type == TOKEN);
455
456        if (!this.record.isEmpty()) {
457            this.recordNumber++;
458            final String comment = sb == null ? null : sb.toString();
459            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
460                    this.recordNumber);
461        }
462        return result;
463    }
464
465}