1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3.text;
18
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.NoSuchElementException;
24
25 import org.apache.commons.lang3.ArrayUtils;
26 import org.apache.commons.lang3.StringUtils;
27
28 /**
 * Tokenizes a string based on delimiters (separators)
30 * and supporting quoting and ignored character concepts.
31 * <p>
32 * This class can split a String into many smaller strings. It aims
33 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34 * however it offers much more control and flexibility including implementing
35 * the <code>ListIterator</code> interface. By default, it is set up
36 * like <code>StringTokenizer</code>.
37 * <p>
38 * The input String is split into a number of <i>tokens</i>.
39 * Each token is separated from the next String by a <i>delimiter</i>.
40 * One or more delimiter characters must be specified.
41 * <p>
42 * Each token may be surrounded by quotes.
43 * The <i>quote</i> matcher specifies the quote character(s).
44 * A quote may be escaped within a quoted section by duplicating itself.
45 * <p>
46 * Between each token and the delimiter are potentially characters that need trimming.
47 * The <i>trimmer</i> matcher specifies these characters.
48 * One usage might be to trim whitespace characters.
49 * <p>
50 * At any point outside the quotes there might potentially be invalid characters.
51 * The <i>ignored</i> matcher specifies these characters to be removed.
52 * One usage might be to remove new line characters.
53 * <p>
54 * Empty tokens may be removed or returned as null.
55 * <pre>
56 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
57 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
58 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59 * </pre>
60 * <p>
61 *
62 * This tokenizer has the following properties and options:
63 *
64 * <table>
65 * <tr>
66 * <th>Property</th><th>Type</th><th>Default</th>
67 * </tr>
68 * <tr>
69 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
70 * </tr>
71 * <tr>
72 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
73 * </tr>
74 * <tr>
75 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
76 * </tr>
77 * <tr>
78 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
79 * </tr>
80 * <tr>
81 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
82 * </tr>
83 * </table>
84 *
85 * @since 2.2
86 * @version $Id: StrTokenizer.java 1436770 2013-01-22 07:09:45Z ggregory $
87 */
88 public class StrTokenizer implements ListIterator<String>, Cloneable {
89
90 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
91 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
92 static {
93 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
94 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
95 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
96 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
97 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
98 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
99 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100
101 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108 }
109
110 /** The text to work on. */
111 private char chars[];
112 /** The parsed tokens */
113 private String tokens[];
114 /** The current iteration position */
115 private int tokenPos;
116
117 /** The delimiter matcher */
118 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119 /** The quote matcher */
120 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121 /** The ignored matcher */
122 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123 /** The trimmer matcher */
124 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125
126 /** Whether to return empty tokens as null */
127 private boolean emptyAsNull = false;
128 /** Whether to ignore empty tokens */
129 private boolean ignoreEmptyTokens = true;
130
131 //-----------------------------------------------------------------------
132
133 /**
134 * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135 *
136 * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137 */
138 private static StrTokenizer getCSVClone() {
139 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140 }
141
142 /**
143 * Gets a new tokenizer instance which parses Comma Separated Value strings
144 * initializing it with the given input. The default for CSV processing
145 * will be trim whitespace from both ends (which can be overridden with
146 * the setTrimmer method).
147 * <p>
148 * You must call a "reset" method to set the string which you want to parse.
149 * @return a new tokenizer instance which parses Comma Separated Value strings
150 */
151 public static StrTokenizer getCSVInstance() {
152 return getCSVClone();
153 }
154
155 /**
156 * Gets a new tokenizer instance which parses Comma Separated Value strings
157 * initializing it with the given input. The default for CSV processing
158 * will be trim whitespace from both ends (which can be overridden with
159 * the setTrimmer method).
160 *
161 * @param input the text to parse
162 * @return a new tokenizer instance which parses Comma Separated Value strings
163 */
164 public static StrTokenizer getCSVInstance(final String input) {
165 final StrTokenizer tok = getCSVClone();
166 tok.reset(input);
167 return tok;
168 }
169
170 /**
171 * Gets a new tokenizer instance which parses Comma Separated Value strings
172 * initializing it with the given input. The default for CSV processing
173 * will be trim whitespace from both ends (which can be overridden with
174 * the setTrimmer method).
175 *
176 * @param input the text to parse
177 * @return a new tokenizer instance which parses Comma Separated Value strings
178 */
179 public static StrTokenizer getCSVInstance(final char[] input) {
180 final StrTokenizer tok = getCSVClone();
181 tok.reset(input);
182 return tok;
183 }
184
185 /**
186 * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187 *
188 * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189 */
190 private static StrTokenizer getTSVClone() {
191 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192 }
193
194
195 /**
196 * Gets a new tokenizer instance which parses Tab Separated Value strings.
197 * The default for CSV processing will be trim whitespace from both ends
198 * (which can be overridden with the setTrimmer method).
199 * <p>
200 * You must call a "reset" method to set the string which you want to parse.
201 * @return a new tokenizer instance which parses Tab Separated Value strings.
202 */
203 public static StrTokenizer getTSVInstance() {
204 return getTSVClone();
205 }
206
207 /**
208 * Gets a new tokenizer instance which parses Tab Separated Value strings.
209 * The default for CSV processing will be trim whitespace from both ends
210 * (which can be overridden with the setTrimmer method).
211 * @param input the string to parse
212 * @return a new tokenizer instance which parses Tab Separated Value strings.
213 */
214 public static StrTokenizer getTSVInstance(final String input) {
215 final StrTokenizer tok = getTSVClone();
216 tok.reset(input);
217 return tok;
218 }
219
220 /**
221 * Gets a new tokenizer instance which parses Tab Separated Value strings.
222 * The default for CSV processing will be trim whitespace from both ends
223 * (which can be overridden with the setTrimmer method).
224 * @param input the string to parse
225 * @return a new tokenizer instance which parses Tab Separated Value strings.
226 */
227 public static StrTokenizer getTSVInstance(final char[] input) {
228 final StrTokenizer tok = getTSVClone();
229 tok.reset(input);
230 return tok;
231 }
232
233 //-----------------------------------------------------------------------
234 /**
235 * Constructs a tokenizer splitting on space, tab, newline and formfeed
236 * as per StringTokenizer, but with no text to tokenize.
237 * <p>
238 * This constructor is normally used with {@link #reset(String)}.
239 */
240 public StrTokenizer() {
241 super();
242 this.chars = null;
243 }
244
245 /**
246 * Constructs a tokenizer splitting on space, tab, newline and formfeed
247 * as per StringTokenizer.
248 *
249 * @param input the string which is to be parsed
250 */
251 public StrTokenizer(final String input) {
252 super();
253 if (input != null) {
254 chars = input.toCharArray();
255 } else {
256 chars = null;
257 }
258 }
259
260 /**
261 * Constructs a tokenizer splitting on the specified delimiter character.
262 *
263 * @param input the string which is to be parsed
264 * @param delim the field delimiter character
265 */
266 public StrTokenizer(final String input, final char delim) {
267 this(input);
268 setDelimiterChar(delim);
269 }
270
271 /**
272 * Constructs a tokenizer splitting on the specified delimiter string.
273 *
274 * @param input the string which is to be parsed
275 * @param delim the field delimiter string
276 */
277 public StrTokenizer(final String input, final String delim) {
278 this(input);
279 setDelimiterString(delim);
280 }
281
282 /**
283 * Constructs a tokenizer splitting using the specified delimiter matcher.
284 *
285 * @param input the string which is to be parsed
286 * @param delim the field delimiter matcher
287 */
288 public StrTokenizer(final String input, final StrMatcher delim) {
289 this(input);
290 setDelimiterMatcher(delim);
291 }
292
293 /**
294 * Constructs a tokenizer splitting on the specified delimiter character
295 * and handling quotes using the specified quote character.
296 *
297 * @param input the string which is to be parsed
298 * @param delim the field delimiter character
299 * @param quote the field quoted string character
300 */
301 public StrTokenizer(final String input, final char delim, final char quote) {
302 this(input, delim);
303 setQuoteChar(quote);
304 }
305
306 /**
307 * Constructs a tokenizer splitting using the specified delimiter matcher
308 * and handling quotes using the specified quote matcher.
309 *
310 * @param input the string which is to be parsed
311 * @param delim the field delimiter matcher
312 * @param quote the field quoted string matcher
313 */
314 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
315 this(input, delim);
316 setQuoteMatcher(quote);
317 }
318
319 /**
320 * Constructs a tokenizer splitting on space, tab, newline and formfeed
321 * as per StringTokenizer.
322 *
323 * @param input the string which is to be parsed, not cloned
324 */
325 public StrTokenizer(final char[] input) {
326 super();
327 this.chars = ArrayUtils.clone(input);
328 }
329
330 /**
331 * Constructs a tokenizer splitting on the specified character.
332 *
333 * @param input the string which is to be parsed, not cloned
334 * @param delim the field delimiter character
335 */
336 public StrTokenizer(final char[] input, final char delim) {
337 this(input);
338 setDelimiterChar(delim);
339 }
340
341 /**
342 * Constructs a tokenizer splitting on the specified string.
343 *
344 * @param input the string which is to be parsed, not cloned
345 * @param delim the field delimiter string
346 */
347 public StrTokenizer(final char[] input, final String delim) {
348 this(input);
349 setDelimiterString(delim);
350 }
351
352 /**
353 * Constructs a tokenizer splitting using the specified delimiter matcher.
354 *
355 * @param input the string which is to be parsed, not cloned
356 * @param delim the field delimiter matcher
357 */
358 public StrTokenizer(final char[] input, final StrMatcher delim) {
359 this(input);
360 setDelimiterMatcher(delim);
361 }
362
363 /**
364 * Constructs a tokenizer splitting on the specified delimiter character
365 * and handling quotes using the specified quote character.
366 *
367 * @param input the string which is to be parsed, not cloned
368 * @param delim the field delimiter character
369 * @param quote the field quoted string character
370 */
371 public StrTokenizer(final char[] input, final char delim, final char quote) {
372 this(input, delim);
373 setQuoteChar(quote);
374 }
375
376 /**
377 * Constructs a tokenizer splitting using the specified delimiter matcher
378 * and handling quotes using the specified quote matcher.
379 *
380 * @param input the string which is to be parsed, not cloned
381 * @param delim the field delimiter character
382 * @param quote the field quoted string character
383 */
384 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
385 this(input, delim);
386 setQuoteMatcher(quote);
387 }
388
389 // API
390 //-----------------------------------------------------------------------
391 /**
392 * Gets the number of tokens found in the String.
393 *
394 * @return the number of matched tokens
395 */
396 public int size() {
397 checkTokenized();
398 return tokens.length;
399 }
400
401 /**
402 * Gets the next token from the String.
403 * Equivalent to {@link #next()} except it returns null rather than
404 * throwing {@link NoSuchElementException} when no tokens remain.
405 *
406 * @return the next sequential token, or null when no more tokens are found
407 */
408 public String nextToken() {
409 if (hasNext()) {
410 return tokens[tokenPos++];
411 }
412 return null;
413 }
414
415 /**
416 * Gets the previous token from the String.
417 *
418 * @return the previous sequential token, or null when no more tokens are found
419 */
420 public String previousToken() {
421 if (hasPrevious()) {
422 return tokens[--tokenPos];
423 }
424 return null;
425 }
426
427 /**
428 * Gets a copy of the full token list as an independent modifiable array.
429 *
430 * @return the tokens as a String array
431 */
432 public String[] getTokenArray() {
433 checkTokenized();
434 return tokens.clone();
435 }
436
437 /**
438 * Gets a copy of the full token list as an independent modifiable list.
439 *
440 * @return the tokens as a String array
441 */
442 public List<String> getTokenList() {
443 checkTokenized();
444 final List<String> list = new ArrayList<String>(tokens.length);
445 for (final String element : tokens) {
446 list.add(element);
447 }
448 return list;
449 }
450
451 /**
452 * Resets this tokenizer, forgetting all parsing and iteration already completed.
453 * <p>
454 * This method allows the same tokenizer to be reused for the same String.
455 *
456 * @return this, to enable chaining
457 */
458 public StrTokenizer reset() {
459 tokenPos = 0;
460 tokens = null;
461 return this;
462 }
463
464 /**
465 * Reset this tokenizer, giving it a new input string to parse.
466 * In this manner you can re-use a tokenizer with the same settings
467 * on multiple input lines.
468 *
469 * @param input the new string to tokenize, null sets no text to parse
470 * @return this, to enable chaining
471 */
472 public StrTokenizer reset(final String input) {
473 reset();
474 if (input != null) {
475 this.chars = input.toCharArray();
476 } else {
477 this.chars = null;
478 }
479 return this;
480 }
481
482 /**
483 * Reset this tokenizer, giving it a new input string to parse.
484 * In this manner you can re-use a tokenizer with the same settings
485 * on multiple input lines.
486 *
487 * @param input the new character array to tokenize, not cloned, null sets no text to parse
488 * @return this, to enable chaining
489 */
490 public StrTokenizer reset(final char[] input) {
491 reset();
492 this.chars = ArrayUtils.clone(input);
493 return this;
494 }
495
496 // ListIterator
497 //-----------------------------------------------------------------------
498 /**
499 * Checks whether there are any more tokens.
500 *
501 * @return true if there are more tokens
502 */
503 @Override
504 public boolean hasNext() {
505 checkTokenized();
506 return tokenPos < tokens.length;
507 }
508
509 /**
510 * Gets the next token.
511 *
512 * @return the next String token
513 * @throws NoSuchElementException if there are no more elements
514 */
515 @Override
516 public String next() {
517 if (hasNext()) {
518 return tokens[tokenPos++];
519 }
520 throw new NoSuchElementException();
521 }
522
523 /**
524 * Gets the index of the next token to return.
525 *
526 * @return the next token index
527 */
528 @Override
529 public int nextIndex() {
530 return tokenPos;
531 }
532
533 /**
534 * Checks whether there are any previous tokens that can be iterated to.
535 *
536 * @return true if there are previous tokens
537 */
538 @Override
539 public boolean hasPrevious() {
540 checkTokenized();
541 return tokenPos > 0;
542 }
543
544 /**
545 * Gets the token previous to the last returned token.
546 *
547 * @return the previous token
548 */
549 @Override
550 public String previous() {
551 if (hasPrevious()) {
552 return tokens[--tokenPos];
553 }
554 throw new NoSuchElementException();
555 }
556
557 /**
558 * Gets the index of the previous token.
559 *
560 * @return the previous token index
561 */
562 @Override
563 public int previousIndex() {
564 return tokenPos - 1;
565 }
566
567 /**
568 * Unsupported ListIterator operation.
569 *
570 * @throws UnsupportedOperationException always
571 */
572 @Override
573 public void remove() {
574 throw new UnsupportedOperationException("remove() is unsupported");
575 }
576
577 /**
578 * Unsupported ListIterator operation.
579 * @param obj this parameter ignored.
580 * @throws UnsupportedOperationException always
581 */
582 @Override
583 public void set(final String obj) {
584 throw new UnsupportedOperationException("set() is unsupported");
585 }
586
587 /**
588 * Unsupported ListIterator operation.
589 * @param obj this parameter ignored.
590 * @throws UnsupportedOperationException always
591 */
592 @Override
593 public void add(final String obj) {
594 throw new UnsupportedOperationException("add() is unsupported");
595 }
596
597 // Implementation
598 //-----------------------------------------------------------------------
599 /**
600 * Checks if tokenization has been done, and if not then do it.
601 */
602 private void checkTokenized() {
603 if (tokens == null) {
604 if (chars == null) {
605 // still call tokenize as subclass may do some work
606 final List<String> split = tokenize(null, 0, 0);
607 tokens = split.toArray(new String[split.size()]);
608 } else {
609 final List<String> split = tokenize(chars, 0, chars.length);
610 tokens = split.toArray(new String[split.size()]);
611 }
612 }
613 }
614
615 /**
616 * Internal method to performs the tokenization.
617 * <p>
618 * Most users of this class do not need to call this method. This method
619 * will be called automatically by other (public) methods when required.
620 * <p>
621 * This method exists to allow subclasses to add code before or after the
622 * tokenization. For example, a subclass could alter the character array,
623 * offset or count to be parsed, or call the tokenizer multiple times on
624 * multiple strings. It is also be possible to filter the results.
625 * <p>
626 * <code>StrTokenizer</code> will always pass a zero offset and a count
627 * equal to the length of the array to this method, however a subclass
628 * may pass other values, or even an entirely different array.
629 *
630 * @param chars the character array being tokenized, may be null
631 * @param offset the start position within the character array, must be valid
632 * @param count the number of characters to tokenize, must be valid
633 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
634 */
635 protected List<String> tokenize(final char[] chars, final int offset, final int count) {
636 if (chars == null || count == 0) {
637 return Collections.emptyList();
638 }
639 final StrBuilder buf = new StrBuilder();
640 final List<String> tokens = new ArrayList<String>();
641 int pos = offset;
642
643 // loop around the entire buffer
644 while (pos >= 0 && pos < count) {
645 // find next token
646 pos = readNextToken(chars, pos, count, buf, tokens);
647
648 // handle case where end of string is a delimiter
649 if (pos >= count) {
650 addToken(tokens, "");
651 }
652 }
653 return tokens;
654 }
655
656 /**
657 * Adds a token to a list, paying attention to the parameters we've set.
658 *
659 * @param list the list to add to
660 * @param tok the token to add
661 */
662 private void addToken(final List<String> list, String tok) {
663 if (StringUtils.isEmpty(tok)) {
664 if (isIgnoreEmptyTokens()) {
665 return;
666 }
667 if (isEmptyTokenAsNull()) {
668 tok = null;
669 }
670 }
671 list.add(tok);
672 }
673
674 /**
675 * Reads character by character through the String to get the next token.
676 *
677 * @param chars the character array being tokenized
678 * @param start the first character of field
679 * @param len the length of the character array being tokenized
680 * @param workArea a temporary work area
681 * @param tokens the list of parsed tokens
682 * @return the starting position of the next field (the character
683 * immediately after the delimiter), or -1 if end of string found
684 */
685 private int readNextToken(final char[] chars, int start, final int len, final StrBuilder workArea, final List<String> tokens) {
686 // skip all leading whitespace, unless it is the
687 // field delimiter or the quote character
688 while (start < len) {
689 final int removeLen = Math.max(
690 getIgnoredMatcher().isMatch(chars, start, start, len),
691 getTrimmerMatcher().isMatch(chars, start, start, len));
692 if (removeLen == 0 ||
693 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
694 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
695 break;
696 }
697 start += removeLen;
698 }
699
700 // handle reaching end
701 if (start >= len) {
702 addToken(tokens, "");
703 return -1;
704 }
705
706 // handle empty token
707 final int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
708 if (delimLen > 0) {
709 addToken(tokens, "");
710 return start + delimLen;
711 }
712
713 // handle found token
714 final int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
715 if (quoteLen > 0) {
716 return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
717 }
718 return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
719 }
720
721 /**
722 * Reads a possibly quoted string token.
723 *
724 * @param chars the character array being tokenized
725 * @param start the first character of field
726 * @param len the length of the character array being tokenized
727 * @param workArea a temporary work area
728 * @param tokens the list of parsed tokens
729 * @param quoteStart the start position of the matched quote, 0 if no quoting
730 * @param quoteLen the length of the matched quote, 0 if no quoting
731 * @return the starting position of the next field (the character
732 * immediately after the delimiter, or if end of string found,
733 * then the length of string
734 */
735 private int readWithQuotes(final char[] chars, final int start, final int len, final StrBuilder workArea,
736 final List<String> tokens, final int quoteStart, final int quoteLen) {
737 // Loop until we've found the end of the quoted
738 // string or the end of the input
739 workArea.clear();
740 int pos = start;
741 boolean quoting = quoteLen > 0;
742 int trimStart = 0;
743
744 while (pos < len) {
745 // quoting mode can occur several times throughout a string
746 // we must switch between quoting and non-quoting until we
747 // encounter a non-quoted delimiter, or end of string
748 if (quoting) {
749 // In quoting mode
750
751 // If we've found a quote character, see if it's
752 // followed by a second quote. If so, then we need
753 // to actually put the quote character into the token
754 // rather than end the token.
755 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
756 if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
757 // matched pair of quotes, thus an escaped quote
758 workArea.append(chars, pos, quoteLen);
759 pos += quoteLen * 2;
760 trimStart = workArea.size();
761 continue;
762 }
763
764 // end of quoting
765 quoting = false;
766 pos += quoteLen;
767 continue;
768 }
769
770 // copy regular character from inside quotes
771 workArea.append(chars[pos++]);
772 trimStart = workArea.size();
773
774 } else {
775 // Not in quoting mode
776
777 // check for delimiter, and thus end of token
778 final int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
779 if (delimLen > 0) {
780 // return condition when end of token found
781 addToken(tokens, workArea.substring(0, trimStart));
782 return pos + delimLen;
783 }
784
785 // check for quote, and thus back into quoting mode
786 if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
787 quoting = true;
788 pos += quoteLen;
789 continue;
790 }
791
792 // check for ignored (outside quotes), and ignore
793 final int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
794 if (ignoredLen > 0) {
795 pos += ignoredLen;
796 continue;
797 }
798
799 // check for trimmed character
800 // don't yet know if its at the end, so copy to workArea
801 // use trimStart to keep track of trim at the end
802 final int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
803 if (trimmedLen > 0) {
804 workArea.append(chars, pos, trimmedLen);
805 pos += trimmedLen;
806 continue;
807 }
808
809 // copy regular character from outside quotes
810 workArea.append(chars[pos++]);
811 trimStart = workArea.size();
812 }
813 }
814
815 // return condition when end of string found
816 addToken(tokens, workArea.substring(0, trimStart));
817 return -1;
818 }
819
820 /**
821 * Checks if the characters at the index specified match the quote
822 * already matched in readNextToken().
823 *
824 * @param chars the character array being tokenized
825 * @param pos the position to check for a quote
826 * @param len the length of the character array being tokenized
827 * @param quoteStart the start position of the matched quote, 0 if no quoting
828 * @param quoteLen the length of the matched quote, 0 if no quoting
829 * @return true if a quote is matched
830 */
831 private boolean isQuote(final char[] chars, final int pos, final int len, final int quoteStart, final int quoteLen) {
832 for (int i = 0; i < quoteLen; i++) {
833 if (pos + i >= len || chars[pos + i] != chars[quoteStart + i]) {
834 return false;
835 }
836 }
837 return true;
838 }
839
840 // Delimiter
841 //-----------------------------------------------------------------------
842 /**
843 * Gets the field delimiter matcher.
844 *
845 * @return the delimiter matcher in use
846 */
847 public StrMatcher getDelimiterMatcher() {
848 return this.delimMatcher;
849 }
850
851 /**
852 * Sets the field delimiter matcher.
853 * <p>
854 * The delimitier is used to separate one token from another.
855 *
856 * @param delim the delimiter matcher to use
857 * @return this, to enable chaining
858 */
859 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
860 if (delim == null) {
861 this.delimMatcher = StrMatcher.noneMatcher();
862 } else {
863 this.delimMatcher = delim;
864 }
865 return this;
866 }
867
868 /**
869 * Sets the field delimiter character.
870 *
871 * @param delim the delimiter character to use
872 * @return this, to enable chaining
873 */
874 public StrTokenizer setDelimiterChar(final char delim) {
875 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
876 }
877
878 /**
879 * Sets the field delimiter string.
880 *
881 * @param delim the delimiter string to use
882 * @return this, to enable chaining
883 */
884 public StrTokenizer setDelimiterString(final String delim) {
885 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
886 }
887
888 // Quote
889 //-----------------------------------------------------------------------
890 /**
891 * Gets the quote matcher currently in use.
892 * <p>
893 * The quote character is used to wrap data between the tokens.
894 * This enables delimiters to be entered as data.
895 * The default value is '"' (double quote).
896 *
897 * @return the quote matcher in use
898 */
899 public StrMatcher getQuoteMatcher() {
900 return quoteMatcher;
901 }
902
903 /**
904 * Set the quote matcher to use.
905 * <p>
906 * The quote character is used to wrap data between the tokens.
907 * This enables delimiters to be entered as data.
908 *
909 * @param quote the quote matcher to use, null ignored
910 * @return this, to enable chaining
911 */
912 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
913 if (quote != null) {
914 this.quoteMatcher = quote;
915 }
916 return this;
917 }
918
919 /**
920 * Sets the quote character to use.
921 * <p>
922 * The quote character is used to wrap data between the tokens.
923 * This enables delimiters to be entered as data.
924 *
925 * @param quote the quote character to use
926 * @return this, to enable chaining
927 */
928 public StrTokenizer setQuoteChar(final char quote) {
929 return setQuoteMatcher(StrMatcher.charMatcher(quote));
930 }
931
932 // Ignored
933 //-----------------------------------------------------------------------
934 /**
935 * Gets the ignored character matcher.
936 * <p>
937 * These characters are ignored when parsing the String, unless they are
938 * within a quoted region.
939 * The default value is not to ignore anything.
940 *
941 * @return the ignored matcher in use
942 */
943 public StrMatcher getIgnoredMatcher() {
944 return ignoredMatcher;
945 }
946
947 /**
948 * Set the matcher for characters to ignore.
949 * <p>
950 * These characters are ignored when parsing the String, unless they are
951 * within a quoted region.
952 *
953 * @param ignored the ignored matcher to use, null ignored
954 * @return this, to enable chaining
955 */
956 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
957 if (ignored != null) {
958 this.ignoredMatcher = ignored;
959 }
960 return this;
961 }
962
963 /**
964 * Set the character to ignore.
965 * <p>
966 * This character is ignored when parsing the String, unless it is
967 * within a quoted region.
968 *
969 * @param ignored the ignored character to use
970 * @return this, to enable chaining
971 */
972 public StrTokenizer setIgnoredChar(final char ignored) {
973 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
974 }
975
976 // Trimmer
977 //-----------------------------------------------------------------------
978 /**
979 * Gets the trimmer character matcher.
980 * <p>
981 * These characters are trimmed off on each side of the delimiter
982 * until the token or quote is found.
983 * The default value is not to trim anything.
984 *
985 * @return the trimmer matcher in use
986 */
987 public StrMatcher getTrimmerMatcher() {
988 return trimmerMatcher;
989 }
990
991 /**
992 * Sets the matcher for characters to trim.
993 * <p>
994 * These characters are trimmed off on each side of the delimiter
995 * until the token or quote is found.
996 *
997 * @param trimmer the trimmer matcher to use, null ignored
998 * @return this, to enable chaining
999 */
1000 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1001 if (trimmer != null) {
1002 this.trimmerMatcher = trimmer;
1003 }
1004 return this;
1005 }
1006
1007 //-----------------------------------------------------------------------
1008 /**
1009 * Gets whether the tokenizer currently returns empty tokens as null.
1010 * The default for this property is false.
1011 *
1012 * @return true if empty tokens are returned as null
1013 */
1014 public boolean isEmptyTokenAsNull() {
1015 return this.emptyAsNull;
1016 }
1017
1018 /**
1019 * Sets whether the tokenizer should return empty tokens as null.
1020 * The default for this property is false.
1021 *
1022 * @param emptyAsNull whether empty tokens are returned as null
1023 * @return this, to enable chaining
1024 */
1025 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1026 this.emptyAsNull = emptyAsNull;
1027 return this;
1028 }
1029
1030 //-----------------------------------------------------------------------
1031 /**
1032 * Gets whether the tokenizer currently ignores empty tokens.
1033 * The default for this property is true.
1034 *
1035 * @return true if empty tokens are not returned
1036 */
1037 public boolean isIgnoreEmptyTokens() {
1038 return ignoreEmptyTokens;
1039 }
1040
1041 /**
1042 * Sets whether the tokenizer should ignore and not return empty tokens.
1043 * The default for this property is true.
1044 *
1045 * @param ignoreEmptyTokens whether empty tokens are not returned
1046 * @return this, to enable chaining
1047 */
1048 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1049 this.ignoreEmptyTokens = ignoreEmptyTokens;
1050 return this;
1051 }
1052
1053 //-----------------------------------------------------------------------
1054 /**
1055 * Gets the String content that the tokenizer is parsing.
1056 *
1057 * @return the string content being parsed
1058 */
1059 public String getContent() {
1060 if (chars == null) {
1061 return null;
1062 }
1063 return new String(chars);
1064 }
1065
1066 //-----------------------------------------------------------------------
1067 /**
1068 * Creates a new instance of this Tokenizer. The new instance is reset so
1069 * that it will be at the start of the token list.
1070 * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1071 *
1072 * @return a new instance of this Tokenizer which has been reset.
1073 */
1074 @Override
1075 public Object clone() {
1076 try {
1077 return cloneReset();
1078 } catch (final CloneNotSupportedException ex) {
1079 return null;
1080 }
1081 }
1082
1083 /**
1084 * Creates a new instance of this Tokenizer. The new instance is reset so that
1085 * it will be at the start of the token list.
1086 *
1087 * @return a new instance of this Tokenizer which has been reset.
1088 * @throws CloneNotSupportedException if there is a problem cloning
1089 */
1090 Object cloneReset() throws CloneNotSupportedException {
1091 // this method exists to enable 100% test coverage
1092 final StrTokenizer cloned = (StrTokenizer) super.clone();
1093 if (cloned.chars != null) {
1094 cloned.chars = cloned.chars.clone();
1095 }
1096 cloned.reset();
1097 return cloned;
1098 }
1099
1100 //-----------------------------------------------------------------------
1101 /**
1102 * Gets the String content that the tokenizer is parsing.
1103 *
1104 * @return the string content being parsed
1105 */
1106 @Override
1107 public String toString() {
1108 if (tokens == null) {
1109 return "StrTokenizer[not tokenized yet]";
1110 }
1111 return "StrTokenizer" + getTokenList();
1112 }
1113
1114 }