View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Token.Type.TOKEN;
21  
22  import java.io.IOException;
23  import java.io.Reader;
24  import java.io.StringReader;
25  import java.util.ArrayList;
26  import java.util.Iterator;
27  import java.util.LinkedHashMap;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NoSuchElementException;
31  
32  /**
33   * Parses CSV files according to the specified configuration.
34   *
35   * Because CSV appears in many different dialects, the parser supports many configuration settings by allowing the
36   * specification of a {@link CSVFormat}.
37   *
38   * <p>
39   * To parse a CSV input with tabs as separators, '"' (double-quote) as an optional value encapsulator, 
40   * and comments starting with '#', you write:
41   * </p>
42   *
43   * <pre>
44   * Reader in = new StringReader(&quot;a\tb\nc\td&quot;);
45   * Iterable&lt;CSVRecord&gt; parser = CSVFormat.newBuilder()
46   *     .withCommentStart('#')
47   *     .withDelimiter('\t')
48   *     .withQuoteChar('"').parse(in);
49   *  for (CSVRecord csvRecord : parser) {
50   *     ...
51   *  }
52   * </pre>
53   *
54   * <p>
55   * To parse CSV input in a given format like Excel, you write:
56   * </p>
57   *
58   * <pre>
59   * Reader in = new StringReader("a;b\nc;d");
60   * Iterable&lt;CSVRecord&gt; parser = CSVFormat.EXCEL.parse(in);
61   * for (CSVRecord record : parser) {
62   *     ...
63   * }
64   * </pre>
65   * <p>
66   * You may also get a List of records:
67   * </p>
68   * <pre>
69   * Reader in = new StringReader("a;b\nc;d");
70   * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
71   * List&lt;CSVRecord&gt; list = parser.getRecords();
72   * </pre>
73   * <p>
74   * Internal parser state is completely covered by the format and the reader-state.
75   * </p>
76   *
77   * <p>
78   * See the <a href="package-summary.html">package documentation</a> for more details.
79   * </p>
80   *
81   * @version $Id: CSVParser.java 1461307 2013-03-26 20:52:28Z ggregory $
82   */
83  public class CSVParser implements Iterable<CSVRecord> {
84  
85      private final Lexer lexer;
86      private final Map<String, Integer> headerMap;
87      private long recordNumber;
88  
89      // the following objects are shared to reduce garbage
90  
91      /** A record buffer for getRecord(). Grows as necessary and is reused. */
92      private final List<String> record = new ArrayList<String>();
93      private final Token reusableToken = new Token();
94  
95      /**
96       * CSV parser using the default {@link CSVFormat}.
97       *
98       * @param input
99       *            a Reader containing "csv-formatted" input
100      * @throws IllegalArgumentException
101      *             thrown if the parameters of the format are inconsistent
102      * @throws IOException
103      *             If an I/O error occurs
104      */
105     public CSVParser(final Reader input) throws IOException {
106         this(input, CSVFormat.DEFAULT);
107     }
108 
109     /**
110      * Customized CSV parser using the given {@link CSVFormat}
111      *
112      * @param input
113      *            a Reader containing CSV-formatted input
114      * @param format
115      *            the CSVFormat used for CSV parsing
116      * @throws IllegalArgumentException
117      *             thrown if the parameters of the format are inconsistent
118      * @throws IOException
119      *             If an I/O error occurs
120      */
121     public CSVParser(final Reader input, final CSVFormat format) throws IOException {
122         this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
123         this.headerMap = initializeHeader(format);
124     }
125 
126     /**
127      * Customized CSV parser using the given {@link CSVFormat}
128      *
129      * @param input
130      *            a String containing "csv-formatted" input
131      * @param format
132      *            the CSVFormat used for CSV parsing
133      * @throws IllegalArgumentException
134      *             thrown if the parameters of the format are inconsistent
135      * @throws IOException
136      *             If an I/O error occurs
137      */
138     public CSVParser(final String input, final CSVFormat format) throws IOException {
139         this(new StringReader(input), format);
140     }
141 
142     /**
143      * Returns a copy of the header map that iterates in column order.
144      * <p>
145      * The map keys are column names.
146      * The map values are 0-based indices.
147      *
148      * @return a copy of the header map that iterates in column order.
149      */
150     public Map<String, Integer> getHeaderMap() {
151         return new LinkedHashMap<String, Integer>(headerMap);
152     }
153 
154     /**
155      * Returns the current line number in the input stream.
156      * <p/>
157      * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number.
158      *
159      * @return current line number
160      */
161     public long getLineNumber() {
162         return lexer.getLineNumber();
163     }
164 
165     /**
166      * Returns the current record number in the input stream.
167      * <p/>
168      * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number.
169      *
170      * @return current line number
171      */
172     public long getRecordNumber() {
173         return recordNumber;
174     }
175 
176     /**
177      * Parses the next record from the current point in the stream.
178      *
179      * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
180      * @throws IOException
181      *             on parse error or input read-failure
182      */
183     CSVRecord nextRecord() throws IOException {
184         CSVRecord result = null;
185         record.clear();
186         StringBuilder sb = null;
187         do {
188             reusableToken.reset();
189             lexer.nextToken(reusableToken);
190             switch (reusableToken.type) {
191             case TOKEN:
192                 record.add(reusableToken.content.toString());
193                 break;
194             case EORECORD:
195                 record.add(reusableToken.content.toString());
196                 break;
197             case EOF:
198                 if (reusableToken.isReady) {
199                     record.add(reusableToken.content.toString());
200                 }
201                 break;
202             case INVALID:
203                 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
204             case COMMENT: // Ignored currently
205                 if (sb == null) { // first comment for this record
206                     sb = new StringBuilder();
207                 } else {
208                     sb.append("\n");
209                 }
210                 sb.append(reusableToken.content);
211                 reusableToken.type = TOKEN; // Read another token
212                 break;
213             }
214         } while (reusableToken.type == TOKEN);
215 
216         if (!record.isEmpty()) {
217             recordNumber++;
218             final String comment = sb == null ? null : sb.toString();
219             result = new CSVRecord(record.toArray(new String[record.size()]), headerMap, comment, this.recordNumber);
220         }
221         return result;
222     }
223 
224     /**
225      * Parses the CSV input according to the given format and returns the content as an array of {@link CSVRecord}
226      * entries.
227      * <p/>
228      * The returned content starts at the current parse-position in the stream.
229      *
230      * @return list of {@link CSVRecord} entries, may be empty
231      * @throws IOException
232      *             on parse error or input read-failure
233      */
234     public List<CSVRecord> getRecords() throws IOException {
235         final List<CSVRecord> records = new ArrayList<CSVRecord>();
236         CSVRecord rec;
237         while ((rec = nextRecord()) != null) {
238             records.add(rec);
239         }
240         return records;
241     }
242 
243     /**
244      * Initializes the name to index mapping if the format defines a header.
245      */
246     private Map<String, Integer> initializeHeader(final CSVFormat format) throws IOException {
247         Map<String, Integer> hdrMap = null;
248         if (format.getHeader() != null) {
249             hdrMap = new LinkedHashMap<String, Integer>();
250 
251             String[] header = null;
252             if (format.getHeader().length == 0) {
253                 // read the header from the first line of the file
254                 final CSVRecord record = nextRecord();
255                 if (record != null) {
256                     header = record.values();
257                 }
258             } else {
259                 header = format.getHeader();
260             }
261 
262             // build the name to index mappings
263             if (header != null) {
264                 for (int i = 0; i < header.length; i++) {
265                     hdrMap.put(header[i], Integer.valueOf(i));
266                 }
267             }
268         }
269         return hdrMap;
270     }
271 
272     /**
273      * Returns an iterator on the records. IOExceptions occurring during the iteration are wrapped in a
274      * RuntimeException.
275      */
276     public Iterator<CSVRecord> iterator() {
277         return new Iterator<CSVRecord>() {
278             private CSVRecord current;
279 
280             private CSVRecord getNextRecord() {
281                 try {
282                     return nextRecord();
283                 } catch (final IOException e) {
284                     // TODO: This is not great, throw an ISE instead?
285                     throw new RuntimeException(e);
286                 }
287             }
288 
289             public boolean hasNext() {
290                 if (current == null) {
291                     current = getNextRecord();
292                 }
293 
294                 return current != null;
295             }
296 
297             public CSVRecord next() {
298                 CSVRecord next = current;
299                 current = null;
300 
301                 if (next == null) {
302                     // hasNext() wasn't called before
303                     next = getNextRecord();
304                     if (next == null) {
305                         throw new NoSuchElementException("No more CSV records available");
306                     }
307                 }
308 
309                 return next;
310             }
311 
312             public void remove() {
313                 throw new UnsupportedOperationException();
314             }
315         };
316     }
317 }