/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
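 * <p>
 * A minimal usage sketch (the input value here is purely illustrative),
 * splitting on commas while trimming whitespace around each token:
 * <pre>
 * StrTokenizer tokenizer = new StrTokenizer("a, b ,c", ',', '"');
 * tokenizer.setTrimmerMatcher(StrMatcher.trimMatcher());
 * while (tokenizer.hasNext()) {
 *     String token = tokenizer.next(); // "a", then "b", then "c"
 * }
 * </pre>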
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 1.0
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
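     * <p>
     * A minimal sketch of the intended usage (the input line is invented for
     * illustration):
     * <pre>
     * StrTokenizer tokenizer = StrTokenizer.getCSVInstance();
     * tokenizer.reset("a, \"b,b\", c");
     * List&lt;String&gt; tokens = tokenizer.getTokenList(); // ["a", "b,b", "c"]
     * </pre>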
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, cloned on construction
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the string which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the string which is to be parsed, cloned on construction
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
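     * <p>
     * A brief sketch of the re-use pattern (the input lines here are made up):
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("", ',');
     * for (String line : new String[] {"a,b", "c,d"}) {
     *     tokenizer.reset(line);
     *     List&lt;String&gt; tokens = tokenizer.getTokenList();
     *     // ["a", "b"] for the first line, ["c", "d"] for the second
     * }
     * </pre>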
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, not cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input;
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
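     * <p>
     * A rough sketch of such a subclass (the filtering rule shown is purely
     * illustrative, not part of this class):
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a,#skip,b", ',') {
     *     &#64;Override
     *     protected List&lt;String&gt; tokenize(char[] srcChars, int offset, int count) {
     *         List&lt;String&gt; tokens = new ArrayList&lt;&gt;(super.tokenize(srcChars, offset, count));
     *         tokens.removeIf(token -&gt; token.startsWith("#")); // drop commented-out fields
     *         return tokens;
     *     }
     * };
     * </pre>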
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len,
            final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len,
            final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quotes.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
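     * <p>
     * A small sketch using the double-quote matcher (the sample input is
     * invented for illustration); a doubled quote inside a quoted section
     * would come through as a literal quote character:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("\"a,a\",b", ',');
     * tokenizer.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
     * List&lt;String&gt; tokens = tokenizer.getTokenList(); // ["a,a", "b"]
     * </pre>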
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
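     * <p>
     * For example (with invented input), trimming whitespace around each
     * token might look like this sketch:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer(" a , b ", ',');
     * tokenizer.setTrimmerMatcher(StrMatcher.trimMatcher());
     * List&lt;String&gt; tokens = tokenizer.getTokenList(); // ["a", "b"]
     * </pre>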
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the list
     * of tokens if the input has already been tokenized.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}