1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.csv;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.io.InputStreamReader;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24
25
26 /**
27 * Parses CSV files according to the specified configuration.
28 *
29 * Because CSV appears in many different dialects, the parser supports many
30 * configuration settings by allowing the specification of a {@link CSVStrategy}.
31 *
32 * <p>Parsing of a csv-string having tabs as separators,
33 * '"' as an optional value encapsulator, and comments starting with '#':</p>
34 * <pre>
35 * String[][] data =
36 * (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
37 * </pre>
38 *
39 * <p>Parsing of a csv-string in Excel CSV format</p>
40 * <pre>
41 * String[][] data =
42 * (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
43 * </pre>
44 *
45 * <p>
46 * Internal parser state is completely covered by the strategy
47 * and the reader-state.</p>
48 *
49 * <p>see <a href="package-summary.html">package documentation</a>
50 * for more details</p>
51 */
52 public class CSVParser {
53
54 /** length of the initial token (content-)buffer */
55 private static final int INITIAL_TOKEN_LENGTH = 50;
56
57 // the token types
58 /** Token has no valid content, i.e. is in its initilized state. */
59 protected static final int TT_INVALID = -1;
60 /** Token with content, at beginning or in the middle of a line. */
61 protected static final int TT_TOKEN = 0;
62 /** Token (which can have content) when end of file is reached. */
63 protected static final int TT_EOF = 1;
64 /** Token with content when end of a line is reached. */
65 protected static final int TT_EORECORD = 2;
66
67 /** Immutable empty String array. */
68 private static final String[] EMPTY_STRING_ARRAY = new String[0];
69
70 // the input stream
71 private final ExtendedBufferedReader in;
72
73 // TODO: this can be made final if setStrategy is removed
74 private CSVStrategy strategy;
75
76 // the following objects are shared to reduce garbage
77 /** A record buffer for getLine(). Grows as necessary and is reused. */
78 private final ArrayList record = new ArrayList();
79 private final Token reusableToken = new Token();
80 private final CharBuffer wsBuf = new CharBuffer();
81 private final CharBuffer code = new CharBuffer(4);
82
83
84 /**
85 * Token is an internal token representation.
86 *
87 * It is used as contract between the lexer and the parser.
88 */
89 static class Token {
90 /** Token type, see TT_xxx constants. */
91 int type = TT_INVALID;
92 /** The content buffer. */
93 CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
94 /** Token ready flag: indicates a valid token with content (ready for the parser). */
95 boolean isReady;
96
97 Token reset() {
98 content.clear();
99 type = TT_INVALID;
100 isReady = false;
101 return this;
102 }
103 }
104
105 // ======================================================
106 // the constructor
107 // ======================================================
108
/**
 * Creates a parser that reads from a byte stream, using the same settings as
 * {@link #CSVParser(Reader)}.
 *
 * The stream is wrapped in an {@link InputStreamReader} without an explicit
 * charset, so bytes are decoded with the platform default charset.
 *
 * @param input an InputStream containing "csv-formatted" stream
 * @deprecated use {@link #CSVParser(Reader)}.
 */
public CSVParser(InputStream input) {
    this(new InputStreamReader(input));
}
118
/**
 * Creates a parser with the default dialect: ',' as value delimiter,
 * '"' as encapsulator and comments disabled.
 *
 * @param input a Reader containing "csv-formatted" input
 */
public CSVParser(Reader input) {
    // note: these settings must match the default CSVStrategy !!
    this(input, new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED));
}
128
/**
 * Creates a parser with a custom value delimiter.
 *
 * Apart from the delimiter the parser uses '"' as encapsulator and has
 * comments disabled.
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter) {
    this(input, new CSVStrategy(delimiter, '"', CSVStrategy.COMMENTS_DISABLED));
}
142
/**
 * Creates a parser from the three basic dialect characters.
 *
 * A {@link CSVStrategy} is built from the given characters; every other
 * setting keeps the value that this CSVStrategy constructor assigns
 * (originally documented as: leading whitespaces are truncated, unicode
 * escapes are not interpreted and empty lines are ignored).
 *
 * @param input a Reader based on "csv-formatted" input
 * @param delimiter a Char used for value separation
 * @param encapsulator a Char used as value encapsulation marker
 * @param commentStart a Char used for comment identification
 * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
 */
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
    this(input, new CSVStrategy(delimiter, encapsulator, commentStart));
}
159
/**
 * Creates a parser for the given input and CSV dialect.
 *
 * The input is wrapped in an ExtendedBufferedReader; all other constructors
 * end up here.
 *
 * @param input a Reader containing "csv-formatted" input
 * @param strategy the CSVStrategy used for CSV parsing
 */
public CSVParser(Reader input, CSVStrategy strategy) {
    this.strategy = strategy;
    this.in = new ExtendedBufferedReader(input);
}
170
171 // ======================================================
172 // the parser
173 // ======================================================
174
175 /**
176 * Parses the CSV according to the given strategy
177 * and returns the content as an array of records
178 * (whereas records are arrays of single values).
179 * <p>
180 * The returned content starts at the current parse-position in
181 * the stream.
182 *
183 * @return matrix of records x values ('null' when end of file)
184 * @throws IOException on parse error or input read-failure
185 */
186 public String[][] getAllValues() throws IOException {
187 ArrayList records = new ArrayList();
188 String[] values;
189 String[][] ret = null;
190 while ((values = getLine()) != null) {
191 records.add(values);
192 }
193 if (records.size() > 0) {
194 ret = new String[records.size()][];
195 records.toArray(ret);
196 }
197 return ret;
198 }
199
200 /**
201 * Parses the CSV according to the given strategy
202 * and returns the next csv-value as string.
203 *
204 * @return next value in the input stream ('null' when end of file)
205 * @throws IOException on parse error or input read-failure
206 */
207 public String nextValue() throws IOException {
208 Token tkn = nextToken();
209 String ret = null;
210 switch (tkn.type) {
211 case TT_TOKEN:
212 case TT_EORECORD:
213 ret = tkn.content.toString();
214 break;
215 case TT_EOF:
216 ret = null;
217 break;
218 case TT_INVALID:
219 default:
220 // error no token available (or error)
221 throw new IOException(
222 "(line " + getLineNumber()
223 + ") invalid parse sequence");
224 // unreachable: break;
225 }
226 return ret;
227 }
228
229 /**
230 * Parses from the current point in the stream til
231 * the end of the current line.
232 *
233 * @return array of values til end of line
234 * ('null' when end of file has been reached)
235 * @throws IOException on parse error or input read-failure
236 */
237 public String[] getLine() throws IOException {
238 String[] ret = EMPTY_STRING_ARRAY;
239 record.clear();
240 while (true) {
241 reusableToken.reset();
242 nextToken(reusableToken);
243 switch (reusableToken.type) {
244 case TT_TOKEN:
245 record.add(reusableToken.content.toString());
246 break;
247 case TT_EORECORD:
248 record.add(reusableToken.content.toString());
249 break;
250 case TT_EOF:
251 if (reusableToken.isReady) {
252 record.add(reusableToken.content.toString());
253 } else {
254 ret = null;
255 }
256 break;
257 case TT_INVALID:
258 default:
259 // error: throw IOException
260 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
261 // unreachable: break;
262 }
263 if (reusableToken.type != TT_TOKEN) {
264 break;
265 }
266 }
267 if (!record.isEmpty()) {
268 ret = (String[]) record.toArray(new String[record.size()]);
269 }
270 return ret;
271 }
272
273 /**
274 * Returns the current line number in the input stream.
275 *
276 * ATTENTION: in case your csv has multiline-values the returned
277 * number does not correspond to the record-number
278 *
279 * @return current line number
280 */
281 public int getLineNumber() {
282 return in.getLineNumber();
283 }
284
285 // ======================================================
286 // the lexer(s)
287 // ======================================================
288
289 /**
290 * Convenience method for <code>nextToken(null)</code>.
291 */
292 protected Token nextToken() throws IOException {
293 return nextToken(new Token());
294 }
295
296 /**
297 * Returns the next token.
298 *
299 * A token corresponds to a term, a record change or an
300 * end-of-file indicator.
301 *
302 * @param tkn an existing Token object to reuse. The caller is responsible to initialize the
303 * Token.
304 * @return the next token found
305 * @throws IOException on stream access error
306 */
307 protected Token nextToken(Token tkn) throws IOException {
308 wsBuf.clear(); // resuse
309
310 // get the last read char (required for empty line detection)
311 int lastChar = in.readAgain();
312
313 // read the next char and set eol
314 /* note: unfourtunately isEndOfLine may consumes a character silently.
315 * this has no effect outside of the method. so a simple workaround
316 * is to call 'readAgain' on the stream...
317 * uh: might using objects instead of base-types (jdk1.5 autoboxing!)
318 */
319 int c = in.read();
320 boolean eol = isEndOfLine(c);
321 c = in.readAgain();
322
323 // empty line detection: eol AND (last char was EOL or beginning)
324 while (strategy.getIgnoreEmptyLines() && eol
325 && (lastChar == '\n'
326 || lastChar == ExtendedBufferedReader.UNDEFINED)
327 && !isEndOfFile(lastChar)) {
328 // go on char ahead ...
329 lastChar = c;
330 c = in.read();
331 eol = isEndOfLine(c);
332 c = in.readAgain();
333 // reached end of file without any content (empty line at the end)
334 if (isEndOfFile(c)) {
335 tkn.type = TT_EOF;
336 return tkn;
337 }
338 }
339
340 // did we reached eof during the last iteration already ? TT_EOF
341 if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
342 tkn.type = TT_EOF;
343 return tkn;
344 }
345
346 // important: make sure a new char gets consumed in each iteration
347 while (!tkn.isReady) {
348 // ignore whitespaces at beginning of a token
349 while (isWhitespace(c) && !eol) {
350 wsBuf.append((char) c);
351 c = in.read();
352 eol = isEndOfLine(c);
353 }
354 // ok, start of token reached: comment, encapsulated, or token
355 if (c == strategy.getCommentStart()) {
356 // ignore everything till end of line and continue (incr linecount)
357 in.readLine();
358 tkn = nextToken(tkn.reset());
359 } else if (c == strategy.getDelimiter()) {
360 // empty token return TT_TOKEN("")
361 tkn.type = TT_TOKEN;
362 tkn.isReady = true;
363 } else if (eol) {
364 // empty token return TT_EORECORD("")
365 //noop: tkn.content.append("");
366 tkn.type = TT_EORECORD;
367 tkn.isReady = true;
368 } else if (c == strategy.getEncapsulator()) {
369 // consume encapsulated token
370 encapsulatedTokenLexer(tkn, c);
371 } else if (isEndOfFile(c)) {
372 // end of file return TT_EOF()
373 //noop: tkn.content.append("");
374 tkn.type = TT_EOF;
375 tkn.isReady = true;
376 } else {
377 // next token must be a simple token
378 // add removed blanks when not ignoring whitespace chars...
379 if (!strategy.getIgnoreLeadingWhitespaces()) {
380 tkn.content.append(wsBuf);
381 }
382 simpleTokenLexer(tkn, c);
383 }
384 }
385 return tkn;
386 }
387
388 /**
389 * A simple token lexer
390 *
391 * Simple token are tokens which are not surrounded by encapsulators.
392 * A simple token might contain escaped delimiters (as \, or \;). The
393 * token is finished when one of the following conditions become true:
394 * <ul>
395 * <li>end of line has been reached (TT_EORECORD)</li>
396 * <li>end of stream has been reached (TT_EOF)</li>
397 * <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
398 * </ul>
399 *
400 * @param tkn the current token
401 * @param c the current character
402 * @return the filled token
403 *
404 * @throws IOException on stream access error
405 */
406 private Token simpleTokenLexer(Token tkn, int c) throws IOException {
407 for (;;) {
408 if (isEndOfLine(c)) {
409 // end of record
410 tkn.type = TT_EORECORD;
411 tkn.isReady = true;
412 break;
413 } else if (isEndOfFile(c)) {
414 // end of file
415 tkn.type = TT_EOF;
416 tkn.isReady = true;
417 break;
418 } else if (c == strategy.getDelimiter()) {
419 // end of token
420 tkn.type = TT_TOKEN;
421 tkn.isReady = true;
422 break;
423 } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
424 // interpret unicode escaped chars (like \u0070 -> p)
425 tkn.content.append((char) unicodeEscapeLexer(c));
426 } else if (c == strategy.getEscape()) {
427 tkn.content.append((char)readEscape(c));
428 } else {
429 tkn.content.append((char) c);
430 }
431
432 c = in.read();
433 }
434
435 if (strategy.getIgnoreTrailingWhitespaces()) {
436 tkn.content.trimTrailingWhitespace();
437 }
438
439 return tkn;
440 }
441
442
443 /**
444 * An encapsulated token lexer
445 *
446 * Encapsulated tokens are surrounded by the given encapsulating-string.
447 * The encapsulator itself might be included in the token using a
448 * doubling syntax (as "", '') or using escaping (as in \", \').
449 * Whitespaces before and after an encapsulated token are ignored.
450 *
451 * @param tkn the current token
452 * @param c the current character
453 * @return a valid token object
454 * @throws IOException on invalid state
455 */
456 private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
457 // save current line
458 int startLineNumber = getLineNumber();
459 // ignore the given delimiter
460 // assert c == delimiter;
461 for (;;) {
462 c = in.read();
463
464 if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
465 tkn.content.append((char) unicodeEscapeLexer(c));
466 } else if (c == strategy.getEscape()) {
467 tkn.content.append((char)readEscape(c));
468 } else if (c == strategy.getEncapsulator()) {
469 if (in.lookAhead() == strategy.getEncapsulator()) {
470 // double or escaped encapsulator -> add single encapsulator to token
471 c = in.read();
472 tkn.content.append((char) c);
473 } else {
474 // token finish mark (encapsulator) reached: ignore whitespace till delimiter
475 for (;;) {
476 c = in.read();
477 if (c == strategy.getDelimiter()) {
478 tkn.type = TT_TOKEN;
479 tkn.isReady = true;
480 return tkn;
481 } else if (isEndOfFile(c)) {
482 tkn.type = TT_EOF;
483 tkn.isReady = true;
484 return tkn;
485 } else if (isEndOfLine(c)) {
486 // ok eo token reached
487 tkn.type = TT_EORECORD;
488 tkn.isReady = true;
489 return tkn;
490 } else if (!isWhitespace(c)) {
491 // error invalid char between token and next delimiter
492 throw new IOException(
493 "(line " + getLineNumber()
494 + ") invalid char between encapsulated token end delimiter"
495 );
496 }
497 }
498 }
499 } else if (isEndOfFile(c)) {
500 // error condition (end of file before end of token)
501 throw new IOException(
502 "(startline " + startLineNumber + ")"
503 + "eof reached before encapsulated token finished"
504 );
505 } else {
506 // consume character
507 tkn.content.append((char) c);
508 }
509 }
510 }
511
512
513 /**
514 * Decodes Unicode escapes.
515 *
516 * Interpretation of "\\uXXXX" escape sequences
517 * where XXXX is a hex-number.
518 * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
519 * @return the decoded character
520 * @throws IOException on wrong unicode escape sequence or read error
521 */
522 protected int unicodeEscapeLexer(int c) throws IOException {
523 int ret = 0;
524 // ignore 'u' (assume c==\ now) and read 4 hex digits
525 c = in.read();
526 code.clear();
527 try {
528 for (int i = 0; i < 4; i++) {
529 c = in.read();
530 if (isEndOfFile(c) || isEndOfLine(c)) {
531 throw new NumberFormatException("number too short");
532 }
533 code.append((char) c);
534 }
535 ret = Integer.parseInt(code.toString(), 16);
536 } catch (NumberFormatException e) {
537 throw new IOException(
538 "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
539 + code.toString() + "'" + e.toString());
540 }
541 return ret;
542 }
543
544 private int readEscape(int c) throws IOException {
545 // assume c is the escape char (normally a backslash)
546 c = in.read();
547 int out;
548 switch (c) {
549 case 'r': out='\r'; break;
550 case 'n': out='\n'; break;
551 case 't': out='\t'; break;
552 case 'b': out='\b'; break;
553 case 'f': out='\f'; break;
554 default : out=c;
555 }
556 return out;
557 }
558
559 // ======================================================
560 // strategies
561 // ======================================================
562
563 /**
564 * Sets the specified CSV Strategy
565 *
566 * @return current instance of CSVParser to allow chained method calls
567 * @deprecated the strategy should be set in the constructor {@link #CSVParser(Reader,CSVStrategy)}.
568 */
569 public CSVParser setStrategy(CSVStrategy strategy) {
570 this.strategy = strategy;
571 return this;
572 }
573
574 /**
575 * Obtain the specified CSV Strategy
576 *
577 * @return strategy currently being used
578 */
579 public CSVStrategy getStrategy() {
580 return this.strategy;
581 }
582
583 // ======================================================
584 // Character class checker
585 // ======================================================
586
587 /**
588 * @return true if the given char is a whitespace character
589 */
590 private boolean isWhitespace(int c) {
591 return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
592 }
593
594 /**
595 * Greedy - accepts \n and \r\n
596 * This checker consumes silently the second control-character...
597 *
598 * @return true if the given character is a line-terminator
599 */
600 private boolean isEndOfLine(int c) throws IOException {
601 // check if we have \r\n...
602 if (c == '\r') {
603 if (in.lookAhead() == '\n') {
604 // note: does not change c outside of this method !!
605 c = in.read();
606 }
607 }
608 return (c == '\n');
609 }
610
611 /**
612 * @return true if the given character indicates end of file
613 */
614 private boolean isEndOfFile(int c) {
615 return c == ExtendedBufferedReader.END_OF_STREAM;
616 }
617 }