1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.csv;
19
20 import static org.apache.commons.csv.Token.Type.TOKEN;
21
22 import java.io.IOException;
23 import java.io.Reader;
24 import java.io.StringReader;
25 import java.util.ArrayList;
26 import java.util.Iterator;
27 import java.util.LinkedHashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.NoSuchElementException;
31
32 /**
33 * Parses CSV files according to the specified configuration.
34 *
35 * Because CSV appears in many different dialects, the parser supports many configuration settings by allowing the
36 * specification of a {@link CSVFormat}.
37 *
38 * <p>
39 * To parse a CSV input with tabs as separators, '"' (double-quote) as an optional value encapsulator,
40 * and comments starting with '#', you write:
41 * </p>
42 *
43 * <pre>
44 * Reader in = new StringReader("a\tb\nc\td");
45 * Iterable<CSVRecord> parser = CSVFormat.newBuilder()
46 * .withCommentStart('#')
47 * .withDelimiter('\t')
48 * .withQuoteChar('"').parse(in);
49 * for (CSVRecord csvRecord : parse) {
50 * ...
51 * }
52 * </pre>
53 *
54 * <p>
55 * To parse CSV input in a given format like Excel, you write:
56 * </p>
57 *
58 * <pre>
59 * Reader in = new StringReader("a;b\nc;d");
60 * Iterable<CSVRecord> parser = CSVFormat.EXCEL.parse(in);
61 * for (CSVRecord record : parser) {
62 * ...
63 * }
64 * </pre>
65 * <p>
66 * You may also get a List of records:
67 * </p>
68 * <pre>
69 * Reader in = new StringReader("a;b\nc;d");
70 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
71 * List<CSVRecord> list = parser.getRecords();
72 * </pre>
73 * <p>
74 * Internal parser state is completely covered by the format and the reader-state.
75 * </p>
76 *
77 * <p>
78 * see <a href="package-summary.html">package documentation</a> for more details
79 * </p>
80 *
81 * @version $Id: CSVParser.java 1461307 2013-03-26 20:52:28Z ggregory $
82 */
83 public class CSVParser implements Iterable<CSVRecord> {
84
85 private final Lexer lexer;
86 private final Map<String, Integer> headerMap;
87 private long recordNumber;
88
89 // the following objects are shared to reduce garbage
90
91 /** A record buffer for getRecord(). Grows as necessary and is reused. */
92 private final List<String> record = new ArrayList<String>();
93 private final Token reusableToken = new Token();
94
95 /**
96 * CSV parser using the default {@link CSVFormat}.
97 *
98 * @param input
99 * a Reader containing "csv-formatted" input
100 * @throws IllegalArgumentException
101 * thrown if the parameters of the format are inconsistent
102 * @throws IOException
103 * If an I/O error occurs
104 */
105 public CSVParser(final Reader input) throws IOException {
106 this(input, CSVFormat.DEFAULT);
107 }
108
109 /**
110 * Customized CSV parser using the given {@link CSVFormat}
111 *
112 * @param input
113 * a Reader containing CSV-formatted input
114 * @param format
115 * the CSVFormat used for CSV parsing
116 * @throws IllegalArgumentException
117 * thrown if the parameters of the format are inconsistent
118 * @throws IOException
119 * If an I/O error occurs
120 */
121 public CSVParser(final Reader input, final CSVFormat format) throws IOException {
122 this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
123 this.headerMap = initializeHeader(format);
124 }
125
126 /**
127 * Customized CSV parser using the given {@link CSVFormat}
128 *
129 * @param input
130 * a String containing "csv-formatted" input
131 * @param format
132 * the CSVFormat used for CSV parsing
133 * @throws IllegalArgumentException
134 * thrown if the parameters of the format are inconsistent
135 * @throws IOException
136 * If an I/O error occurs
137 */
138 public CSVParser(final String input, final CSVFormat format) throws IOException {
139 this(new StringReader(input), format);
140 }
141
142 /**
143 * Returns a copy of the header map that iterates in column order.
144 * <p>
145 * The map keys are column names.
146 * The map values are 0-based indices.
147 *
148 * @return a copy of the header map that iterates in column order.
149 */
150 public Map<String, Integer> getHeaderMap() {
151 return new LinkedHashMap<String, Integer>(headerMap);
152 }
153
154 /**
155 * Returns the current line number in the input stream.
156 * <p/>
157 * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number.
158 *
159 * @return current line number
160 */
161 public long getLineNumber() {
162 return lexer.getLineNumber();
163 }
164
165 /**
166 * Returns the current record number in the input stream.
167 * <p/>
168 * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number.
169 *
170 * @return current line number
171 */
172 public long getRecordNumber() {
173 return recordNumber;
174 }
175
176 /**
177 * Parses the next record from the current point in the stream.
178 *
179 * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
180 * @throws IOException
181 * on parse error or input read-failure
182 */
183 CSVRecord nextRecord() throws IOException {
184 CSVRecord result = null;
185 record.clear();
186 StringBuilder sb = null;
187 do {
188 reusableToken.reset();
189 lexer.nextToken(reusableToken);
190 switch (reusableToken.type) {
191 case TOKEN:
192 record.add(reusableToken.content.toString());
193 break;
194 case EORECORD:
195 record.add(reusableToken.content.toString());
196 break;
197 case EOF:
198 if (reusableToken.isReady) {
199 record.add(reusableToken.content.toString());
200 }
201 break;
202 case INVALID:
203 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
204 case COMMENT: // Ignored currently
205 if (sb == null) { // first comment for this record
206 sb = new StringBuilder();
207 } else {
208 sb.append("\n");
209 }
210 sb.append(reusableToken.content);
211 reusableToken.type = TOKEN; // Read another token
212 break;
213 }
214 } while (reusableToken.type == TOKEN);
215
216 if (!record.isEmpty()) {
217 recordNumber++;
218 final String comment = sb == null ? null : sb.toString();
219 result = new CSVRecord(record.toArray(new String[record.size()]), headerMap, comment, this.recordNumber);
220 }
221 return result;
222 }
223
224 /**
225 * Parses the CSV input according to the given format and returns the content as an array of {@link CSVRecord}
226 * entries.
227 * <p/>
228 * The returned content starts at the current parse-position in the stream.
229 *
230 * @return list of {@link CSVRecord} entries, may be empty
231 * @throws IOException
232 * on parse error or input read-failure
233 */
234 public List<CSVRecord> getRecords() throws IOException {
235 final List<CSVRecord> records = new ArrayList<CSVRecord>();
236 CSVRecord rec;
237 while ((rec = nextRecord()) != null) {
238 records.add(rec);
239 }
240 return records;
241 }
242
243 /**
244 * Initializes the name to index mapping if the format defines a header.
245 */
246 private Map<String, Integer> initializeHeader(final CSVFormat format) throws IOException {
247 Map<String, Integer> hdrMap = null;
248 if (format.getHeader() != null) {
249 hdrMap = new LinkedHashMap<String, Integer>();
250
251 String[] header = null;
252 if (format.getHeader().length == 0) {
253 // read the header from the first line of the file
254 final CSVRecord record = nextRecord();
255 if (record != null) {
256 header = record.values();
257 }
258 } else {
259 header = format.getHeader();
260 }
261
262 // build the name to index mappings
263 if (header != null) {
264 for (int i = 0; i < header.length; i++) {
265 hdrMap.put(header[i], Integer.valueOf(i));
266 }
267 }
268 }
269 return hdrMap;
270 }
271
272 /**
273 * Returns an iterator on the records. IOExceptions occurring during the iteration are wrapped in a
274 * RuntimeException.
275 */
276 public Iterator<CSVRecord> iterator() {
277 return new Iterator<CSVRecord>() {
278 private CSVRecord current;
279
280 private CSVRecord getNextRecord() {
281 try {
282 return nextRecord();
283 } catch (final IOException e) {
284 // TODO: This is not great, throw an ISE instead?
285 throw new RuntimeException(e);
286 }
287 }
288
289 public boolean hasNext() {
290 if (current == null) {
291 current = getNextRecord();
292 }
293
294 return current != null;
295 }
296
297 public CSVRecord next() {
298 CSVRecord next = current;
299 current = null;
300
301 if (next == null) {
302 // hasNext() wasn't called before
303 next = getNextRecord();
304 if (next == null) {
305 throw new NoSuchElementException("No more CSV records available");
306 }
307 }
308
309 return next;
310 }
311
312 public void remove() {
313 throw new UnsupportedOperationException();
314 }
315 };
316 }
317 }