1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.csv;
19
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.io.StringReader;
23 import java.util.ArrayList;
24 import java.util.Iterator;
25 import java.util.List;
26 import java.util.NoSuchElementException;
27
28 import org.apache.commons.csv.CSVLexer.Token;
29
30 import static org.apache.commons.csv.CSVLexer.Token.Type.*;
31
32 /**
33 * Parses CSV files according to the specified configuration.
34 *
35 * Because CSV appears in many different dialects, the parser supports many
36 * configuration settings by allowing the specification of a {@link CSVFormat}.
37 *
38 * <p>Parsing of a csv-string having tabs as separators,
39 * '"' as an optional value encapsulator, and comments starting with '#':</p>
40 * <pre>
41 * CSVFormat format = new CSVFormat('\t', '"', '#');
42 * Reader in = new StringReader("a\tb\nc\td");
43 * String[][] records = new CSVParser(in, format).getRecords();
44 * </pre>
45 *
46 * <p>Parsing of a csv-string in Excel CSV format, using a for-each loop:</p>
47 * <pre>
48 * Reader in = new StringReader("a;b\nc;d");
49 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
50 * for (String[] record : parser) {
51 * ...
52 * }
53 * </pre>
54 *
55 * <p>
56 * Internal parser state is completely covered by the format
57 * and the reader-state.</p>
58 *
59 * <p>see <a href="package-summary.html">package documentation</a>
60 * for more details</p>
61 */
62 public class CSVParser implements Iterable<String[]> {
63
64 /** Immutable empty String array. */
65 private static final String[] EMPTY_STRING_ARRAY = new String[0];
66
67 private final CSVLexer lexer;
68
69 // the following objects are shared to reduce garbage
70
71 /** A record buffer for getRecord(). Grows as necessary and is reused. */
72 private final List<String> record = new ArrayList<String>();
73 private final Token reusableToken = new Token();
74
75 /**
76 * CSV parser using the default {@link CSVFormat}.
77 *
78 * @param input a Reader containing "csv-formatted" input
79 * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
80 */
81 public CSVParser(Reader input) {
82 this(input, CSVFormat.DEFAULT);
83 }
84
85 /**
86 * Customized CSV parser using the given {@link CSVFormat}
87 *
88 * @param input a Reader containing "csv-formatted" input
89 * @param format the CSVFormat used for CSV parsing
90 * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
91 */
92 public CSVParser(Reader input, CSVFormat format) {
93 format.validate();
94
95 if (format.isUnicodeEscapesInterpreted()) {
96 input = new UnicodeUnescapeReader(input);
97 }
98
99 this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
100 }
101
102 /**
103 * Customized CSV parser using the given {@link CSVFormat}
104 *
105 * @param input a String containing "csv-formatted" input
106 * @param format the CSVFormat used for CSV parsing
107 * @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
108 */
109 public CSVParser(String input, CSVFormat format) {
110 this(new StringReader(input), format);
111 }
112
113
114 /**
115 * Parses the CSV input according to the given format and returns the content
116 * as an array of records (whereas records are arrays of single values).
117 * <p/>
118 * The returned content starts at the current parse-position in the stream.
119 *
120 * @return matrix of records x values ('null' when end of file)
121 * @throws IOException on parse error or input read-failure
122 */
123 public String[][] getRecords() throws IOException {
124 List<String[]> records = new ArrayList<String[]>();
125 String[] record;
126 while ((record = getRecord()) != null) {
127 records.add(record);
128 }
129
130 if (!records.isEmpty()) {
131 return records.toArray(new String[records.size()][]);
132 } else {
133 return null;
134 }
135 }
136
137 /**
138 * Parses the next record from the current point in the stream.
139 *
140 * @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
141 * @throws IOException on parse error or input read-failure
142 */
143 String[] getRecord() throws IOException {
144 String[] result = EMPTY_STRING_ARRAY;
145 record.clear();
146 while (true) {
147 reusableToken.reset();
148 lexer.nextToken(reusableToken);
149 switch (reusableToken.type) {
150 case TOKEN:
151 record.add(reusableToken.content.toString());
152 break;
153 case EORECORD:
154 record.add(reusableToken.content.toString());
155 break;
156 case EOF:
157 if (reusableToken.isReady) {
158 record.add(reusableToken.content.toString());
159 } else {
160 result = null;
161 }
162 break;
163 case INVALID:
164 // error: throw IOException
165 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
166 // unreachable: break;
167 }
168 if (reusableToken.type != TOKEN) {
169 break;
170 }
171 }
172 if (!record.isEmpty()) {
173 result = record.toArray(new String[record.size()]);
174 }
175 return result;
176 }
177
178 /**
179 * Returns an iterator on the records. IOExceptions occuring
180 * during the iteration are wrapped in a RuntimeException.
181 */
182 public Iterator<String[]> iterator() {
183 return new Iterator<String[]>() {
184 String[] current;
185
186 public boolean hasNext() {
187 if (current == null) {
188 current = getNextLine();
189 }
190
191 return current != null;
192 }
193
194 public String[] next() {
195 String[] next = current;
196 current = null;
197
198 if (next == null) {
199 // hasNext() wasn't called before
200 next = getNextLine();
201 if (next == null) {
202 throw new NoSuchElementException("No more CSV records available");
203 }
204 }
205
206 return next;
207 }
208
209 private String[] getNextLine() {
210 try {
211 return getRecord();
212 } catch (IOException e) {
213 throw new RuntimeException(e);
214 }
215 }
216
217 public void remove() { }
218 };
219 }
220
221 /**
222 * Returns the current line number in the input stream.
223 * <p/>
224 * ATTENTION: in case your csv has multiline-values the returned
225 * number does not correspond to the record-number
226 *
227 * @return current line number
228 */
229 public int getLineNumber() {
230 return lexer.getLineNumber();
231 }
232 }
233
234
235 class CSVLexer {
236
237 /** length of the initial token (content-)buffer */
238 private static final int INITIAL_TOKEN_LENGTH = 50;
239
240 private final CharBuffer wsBuf = new CharBuffer();
241
242 private final CSVFormat format;
243
244 /** The input stream */
245 private final ExtendedBufferedReader in;
246
247 /**
248 * Token is an internal token representation.
249 * <p/>
250 * It is used as contract between the lexer and the parser.
251 */
252 static class Token {
253
254 enum Type {
255 /** Token has no valid content, i.e. is in its initialized state. */
256 INVALID,
257
258 /** Token with content, at beginning or in the middle of a line. */
259 TOKEN,
260
261 /** Token (which can have content) when end of file is reached. */
262 EOF,
263
264 /** Token with content when end of a line is reached. */
265 EORECORD
266 }
267
268 /** Token type */
269 Type type = INVALID;
270
271 /** The content buffer. */
272 CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
273
274 /** Token ready flag: indicates a valid token with content (ready for the parser). */
275 boolean isReady;
276
277 Token reset() {
278 content.clear();
279 type = INVALID;
280 isReady = false;
281 return this;
282 }
283 }
284
285 CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
286 this.format = format;
287 this.in = in;
288 }
289
290 public int getLineNumber() {
291 return in.getLineNumber();
292 }
293
294 /**
295 * Returns the next token.
296 * <p/>
297 * A token corresponds to a term, a record change or an end-of-file indicator.
298 *
299 * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
300 * @return the next token found
301 * @throws IOException on stream access error
302 */
303 Token nextToken(Token tkn) throws IOException {
304 wsBuf.clear(); // reuse
305
306 // get the last read char (required for empty line detection)
307 int lastChar = in.readAgain();
308
309 // read the next char and set eol
310 /* note: unfortunately isEndOfLine may consumes a character silently.
311 * this has no effect outside of the method. so a simple workaround
312 * is to call 'readAgain' on the stream...
313 * uh: might using objects instead of base-types (jdk1.5 autoboxing!)
314 */
315 int c = in.read();
316 boolean eol = isEndOfLine(c);
317 c = in.readAgain();
318
319 // empty line detection: eol AND (last char was EOL or beginning)
320 while (format.isEmptyLinesIgnored() && eol
321 && (lastChar == '\n'
322 || lastChar == '\r'
323 || lastChar == ExtendedBufferedReader.UNDEFINED)
324 && !isEndOfFile(lastChar)) {
325 // go on char ahead ...
326 lastChar = c;
327 c = in.read();
328 eol = isEndOfLine(c);
329 c = in.readAgain();
330 // reached end of file without any content (empty line at the end)
331 if (isEndOfFile(c)) {
332 tkn.type = EOF;
333 return tkn;
334 }
335 }
336
337 // did we reach eof during the last iteration already ? EOF
338 if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) {
339 tkn.type = EOF;
340 return tkn;
341 }
342
343 // important: make sure a new char gets consumed in each iteration
344 while (!tkn.isReady && tkn.type != EOF) {
345 // ignore whitespaces at beginning of a token
346 while (format.isLeadingSpacesIgnored() && isWhitespace(c) && !eol) {
347 wsBuf.append((char) c);
348 c = in.read();
349 eol = isEndOfLine(c);
350 }
351 // ok, start of token reached: comment, encapsulated, or token
352 if (c == format.getCommentStart()) {
353 // ignore everything till end of line and continue (incr linecount)
354 in.readLine();
355 tkn = nextToken(tkn.reset());
356 } else if (c == format.getDelimiter()) {
357 // empty token return TOKEN("")
358 tkn.type = TOKEN;
359 tkn.isReady = true;
360 } else if (eol) {
361 // empty token return EORECORD("")
362 //noop: tkn.content.append("");
363 tkn.type = EORECORD;
364 tkn.isReady = true;
365 } else if (c == format.getEncapsulator()) {
366 // consume encapsulated token
367 encapsulatedTokenLexer(tkn, c);
368 } else if (isEndOfFile(c)) {
369 // end of file return EOF()
370 //noop: tkn.content.append("");
371 tkn.type = EOF;
372 tkn.isReady = true;
373 } else {
374 // next token must be a simple token
375 // add removed blanks when not ignoring whitespace chars...
376 if (!format.isLeadingSpacesIgnored()) {
377 tkn.content.append(wsBuf);
378 }
379 simpleTokenLexer(tkn, c);
380 }
381 }
382 return tkn;
383 }
384
385 /**
386 * A simple token lexer
387 * <p/>
388 * Simple token are tokens which are not surrounded by encapsulators.
389 * A simple token might contain escaped delimiters (as \, or \;). The
390 * token is finished when one of the following conditions become true:
391 * <ul>
392 * <li>end of line has been reached (EORECORD)</li>
393 * <li>end of stream has been reached (EOF)</li>
394 * <li>an unescaped delimiter has been reached (TOKEN)</li>
395 * </ul>
396 *
397 * @param tkn the current token
398 * @param c the current character
399 * @return the filled token
400 * @throws IOException on stream access error
401 */
402 private Token simpleTokenLexer(Token tkn, int c) throws IOException {
403 for (; ;) {
404 if (isEndOfLine(c)) {
405 // end of record
406 tkn.type = EORECORD;
407 tkn.isReady = true;
408 break;
409 } else if (isEndOfFile(c)) {
410 // end of file
411 tkn.type = EOF;
412 tkn.isReady = true;
413 break;
414 } else if (c == format.getDelimiter()) {
415 // end of token
416 tkn.type = TOKEN;
417 tkn.isReady = true;
418 break;
419 } else if (c == format.getEscape()) {
420 tkn.content.append((char) readEscape(c));
421 } else {
422 tkn.content.append((char) c);
423 }
424
425 c = in.read();
426 }
427
428 if (format.isTrailingSpacesIgnored()) {
429 tkn.content.trimTrailingWhitespace();
430 }
431
432 return tkn;
433 }
434
435
436 /**
437 * An encapsulated token lexer
438 * <p/>
439 * Encapsulated tokens are surrounded by the given encapsulating-string.
440 * The encapsulator itself might be included in the token using a
441 * doubling syntax (as "", '') or using escaping (as in \", \').
442 * Whitespaces before and after an encapsulated token are ignored.
443 *
444 * @param tkn the current token
445 * @param c the current character
446 * @return a valid token object
447 * @throws IOException on invalid state
448 */
449 private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
450 // save current line
451 int startLineNumber = getLineNumber();
452 // ignore the given delimiter
453 // assert c == delimiter;
454 for (; ;) {
455 c = in.read();
456
457 if (c == format.getEscape()) {
458 tkn.content.append((char) readEscape(c));
459 } else if (c == format.getEncapsulator()) {
460 if (in.lookAhead() == format.getEncapsulator()) {
461 // double or escaped encapsulator -> add single encapsulator to token
462 c = in.read();
463 tkn.content.append((char) c);
464 } else {
465 // token finish mark (encapsulator) reached: ignore whitespace till delimiter
466 for (; ;) {
467 c = in.read();
468 if (c == format.getDelimiter()) {
469 tkn.type = TOKEN;
470 tkn.isReady = true;
471 return tkn;
472 } else if (isEndOfFile(c)) {
473 tkn.type = EOF;
474 tkn.isReady = true;
475 return tkn;
476 } else if (isEndOfLine(c)) {
477 // ok eo token reached
478 tkn.type = EORECORD;
479 tkn.isReady = true;
480 return tkn;
481 } else if (!isWhitespace(c)) {
482 // error invalid char between token and next delimiter
483 throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
484 }
485 }
486 }
487 } else if (isEndOfFile(c)) {
488 // error condition (end of file before end of token)
489 throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
490 } else {
491 // consume character
492 tkn.content.append((char) c);
493 }
494 }
495 }
496
497 private int readEscape(int c) throws IOException {
498 // assume c is the escape char (normally a backslash)
499 c = in.read();
500 switch (c) {
501 case 'r':
502 return '\r';
503 case 'n':
504 return '\n';
505 case 't':
506 return '\t';
507 case 'b':
508 return '\b';
509 case 'f':
510 return '\f';
511 default:
512 return c;
513 }
514 }
515
516 /**
517 * @return true if the given char is a whitespace character
518 */
519 private boolean isWhitespace(int c) {
520 return Character.isWhitespace((char) c) && (c != format.getDelimiter());
521 }
522
523 /**
524 * Greedy - accepts \n, \r and \r\n
525 * This checker consumes silently the second control-character...
526 *
527 * @return true if the given character is a line-terminator
528 */
529 private boolean isEndOfLine(int c) throws IOException {
530 // check if we have \r\n...
531 if (c == '\r' && in.lookAhead() == '\n') {
532 // note: does not change c outside of this method !!
533 c = in.read();
534 }
535 return (c == '\n' || c == '\r');
536 }
537
538 /**
539 * @return true if the given character indicates end of file
540 */
541 private boolean isEndOfFile(int c) {
542 return c == ExtendedBufferedReader.END_OF_STREAM;
543 }
544 }