1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.ListIterator;
24 import java.util.NoSuchElementException;
25
26 import org.apache.commons.lang3.ArrayUtils;
27 import org.apache.commons.lang3.StringUtils;
28 import org.apache.commons.text.matcher.StringMatcher;
29 import org.apache.commons.text.matcher.StringMatcherFactory;
30
31 /**
32 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
33 * <p>
34 * This class can split a String into many smaller strings. It aims to do a similar job to
35 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
36 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
37 * <p>
38 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
39 * <em>delimiter</em>. One or more delimiter characters must be specified.
40 * <p>
41 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
42 * escaped within a quoted section by duplicating itself.
43 * <p>
44 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
45 * specifies these characters. One usage might be to trim whitespace characters.
46 * <p>
47 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
48 * these characters to be removed. One usage might be to remove new line characters.
49 * <p>
50 * Empty tokens may be removed or returned as null.
51 *
52 * <pre>
53 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
54 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
55 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56 * </pre>
57 *
58 * <table>
59 * <caption>StringTokenizer properties and options</caption>
60 * <tr>
61 * <th>Property</th>
62 * <th>Type</th>
63 * <th>Default</th>
64 * </tr>
65 * <tr>
66 * <td>delim</td>
67 * <td>CharSetMatcher</td>
68 * <td>{ \t\n\r\f}</td>
69 * </tr>
70 * <tr>
71 * <td>quote</td>
72 * <td>NoneMatcher</td>
73 * <td>{}</td>
74 * </tr>
75 * <tr>
76 * <td>ignore</td>
77 * <td>NoneMatcher</td>
78 * <td>{}</td>
79 * </tr>
80 * <tr>
81 * <td>emptyTokenAsNull</td>
82 * <td>boolean</td>
83 * <td>false</td>
84 * </tr>
85 * <tr>
86 * <td>ignoreEmptyTokens</td>
87 * <td>boolean</td>
88 * <td>true</td>
89 * </tr>
90 * </table>
91 *
92 * @since 1.3
93 */
94 public class StringTokenizer implements ListIterator<String>, Cloneable {
95
96 /** Comma separated values tokenizer internal variable. */
97 // @formatter:off
98 private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
99 .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
100 .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
101 .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
102 .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
103 .setEmptyTokenAsNull(false)
104 .setIgnoreEmptyTokens(false);
105 // @formatter:on
106
107 /** Tab separated values tokenizer internal variable. */
108 // @formatter:off
109 private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
110 .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
111 .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
112 .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
113 .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
114 .setEmptyTokenAsNull(false)
115 .setIgnoreEmptyTokens(false);
116 // @formatter:on
117
118 /**
119 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
120 *
121 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122 */
123 private static StringTokenizer getCSVClone() {
124 return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
125 }
126
127 /**
128 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
129 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
130 * setTrimmer method).
131 * <p>
132 * You must call a "reset" method to set the string which you want to parse.
133 * </p>
134 *
135 * @return a new tokenizer instance which parses Comma Separated Value strings.
136 */
137 public static StringTokenizer getCSVInstance() {
138 return getCSVClone();
139 }
140
141 /**
142 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
143 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
144 *
145 * @param input the text to parse.
146 * @return a new tokenizer instance which parses Comma Separated Value strings.
147 */
148 public static StringTokenizer getCSVInstance(final char[] input) {
149 return getCSVClone().reset(input);
150 }
151
152 /**
153 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
154 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
155 *
156 * @param input the text to parse.
157 * @return a new tokenizer instance which parses Comma Separated Value strings.
158 */
159 public static StringTokenizer getCSVInstance(final String input) {
160 return getCSVClone().reset(input);
161 }
162
163 /**
164 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
165 *
166 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167 */
168 private static StringTokenizer getTSVClone() {
169 return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
170 }
171
172 /**
173 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
174 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
175 * <p>
176 * You must call a "reset" method to set the string which you want to parse.
177 * </p>
178 *
179 * @return a new tokenizer instance which parses Tab Separated Value strings.
180 */
181 public static StringTokenizer getTSVInstance() {
182 return getTSVClone();
183 }
184
185 /**
186 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
187 * be overridden with the setTrimmer method).
188 *
189 * @param input the string to parse.
190 * @return a new tokenizer instance which parses Tab Separated Value strings.
191 */
192 public static StringTokenizer getTSVInstance(final char[] input) {
193 return getTSVClone().reset(input);
194 }
195
196 /**
197 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
198 * be overridden with the setTrimmer method).
199 *
200 * @param input the string to parse.
201 * @return a new tokenizer instance which parses Tab Separated Value strings.
202 */
203 public static StringTokenizer getTSVInstance(final String input) {
204 return getTSVClone().reset(input);
205 }
206
207 /** The text to work on. */
208 private char[] chars;
209
210 /** The parsed tokens. */
211 private String[] tokens;
212
213 /** The current iteration position. */
214 private int tokenPos;
215
216 /** The delimiter matcher. */
217 private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
218
219 /** The quote matcher. */
220 private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
221
222 /** The ignored matcher. */
223 private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
224
225 /** The trimmer matcher. */
226 private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
227
228 /** Whether to return empty tokens as null. */
229 private boolean emptyAsNull;
230
231 /** Whether to ignore empty tokens. */
232 private boolean ignoreEmptyTokens = true;
233
234 /**
235 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
236 * <p>
237 * This constructor is normally used with {@link #reset(String)}.
238 * </p>
239 */
240 public StringTokenizer() {
241 this.chars = null;
242 }
243
244 /**
245 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
246 *
247 * @param input the string which is to be parsed, not cloned.
248 */
249 public StringTokenizer(final char[] input) {
250 this.chars = input != null ? input.clone() : null;
251 }
252
253 /**
254 * Constructs a tokenizer splitting on the specified character.
255 *
256 * @param input the string which is to be parsed, not cloned.
257 * @param delim the field delimiter character.
258 */
259 public StringTokenizer(final char[] input, final char delim) {
260 this(input);
261 setDelimiterChar(delim);
262 }
263
264 /**
265 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
266 *
267 * @param input the string which is to be parsed, not cloned.
268 * @param delim the field delimiter character.
269 * @param quote the field quoted string character.
270 */
271 public StringTokenizer(final char[] input, final char delim, final char quote) {
272 this(input, delim);
273 setQuoteChar(quote);
274 }
275
276 /**
277 * Constructs a tokenizer splitting on the specified string.
278 *
279 * @param input the string which is to be parsed, not cloned.
280 * @param delim the field delimiter string.
281 */
282 public StringTokenizer(final char[] input, final String delim) {
283 this(input);
284 setDelimiterString(delim);
285 }
286
287 /**
288 * Constructs a tokenizer splitting using the specified delimiter matcher.
289 *
290 * @param input the string which is to be parsed, not cloned.
291 * @param delim the field delimiter matcher.
292 */
293 public StringTokenizer(final char[] input, final StringMatcher delim) {
294 this(input);
295 setDelimiterMatcher(delim);
296 }
297
298 /**
299 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
300 *
301 * @param input the string which is to be parsed, not cloned.
302 * @param delim the field delimiter character.
303 * @param quote the field quoted string character.
304 */
305 public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
306 this(input, delim);
307 setQuoteMatcher(quote);
308 }
309
310 /**
311 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
312 *
313 * @param input the string which is to be parsed.
314 */
315 public StringTokenizer(final String input) {
316 this.chars = input != null ? input.toCharArray() : null;
317 }
318
319 /**
320 * Constructs a tokenizer splitting on the specified delimiter character.
321 *
322 * @param input the string which is to be parsed.
323 * @param delim the field delimiter character.
324 */
325 public StringTokenizer(final String input, final char delim) {
326 this(input);
327 setDelimiterChar(delim);
328 }
329
330 /**
331 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
332 *
333 * @param input the string which is to be parsed.
334 * @param delim the field delimiter character.
335 * @param quote the field quoted string character.
336 */
337 public StringTokenizer(final String input, final char delim, final char quote) {
338 this(input, delim);
339 setQuoteChar(quote);
340 }
341
342 /**
343 * Constructs a tokenizer splitting on the specified delimiter string.
344 *
345 * @param input the string which is to be parsed.
346 * @param delim the field delimiter string.
347 */
348 public StringTokenizer(final String input, final String delim) {
349 this(input);
350 setDelimiterString(delim);
351 }
352
353 /**
354 * Constructs a tokenizer splitting using the specified delimiter matcher.
355 *
356 * @param input the string which is to be parsed.
357 * @param delim the field delimiter matcher.
358 */
359 public StringTokenizer(final String input, final StringMatcher delim) {
360 this(input);
361 setDelimiterMatcher(delim);
362 }
363
364 /**
365 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
366 *
367 * @param input the string which is to be parsed.
368 * @param delim the field delimiter matcher.
369 * @param quote the field quoted string matcher.
370 */
371 public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
372 this(input, delim);
373 setQuoteMatcher(quote);
374 }
375
376 /**
377 * Unsupported ListIterator operation.
378 *
379 * @param obj this parameter ignored.
380 * @throws UnsupportedOperationException always.
381 */
382 @Override
383 public void add(final String obj) {
384 throw new UnsupportedOperationException("add() is unsupported");
385 }
386
387 /**
388 * Adds a token to a list, paying attention to the parameters we've set.
389 *
390 * @param list the list to add to.
391 * @param tok the token to add.
392 */
393 private void addToken(final List<String> list, String tok) {
394 if (tok == null || tok.isEmpty()) {
395 if (isIgnoreEmptyTokens()) {
396 return;
397 }
398 if (isEmptyTokenAsNull()) {
399 tok = null;
400 }
401 }
402 list.add(tok);
403 }
404
405 /**
406 * Checks if tokenization has been done, and if not then do it.
407 */
408 private void checkTokenized() {
409 if (tokens == null) {
410 final List<String> split;
411 if (chars == null) {
412 // still call tokenize as subclass may do some work.
413 split = tokenize(null, 0, 0);
414 } else {
415 split = tokenize(chars, 0, chars.length);
416 }
417 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
418 }
419 }
420
421 /**
422 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
423 * {@link CloneNotSupportedException} is caught, return {@code null}.
424 *
425 * @return a new instance of this Tokenizer which has been reset.
426 */
427 @Override
428 public Object clone() {
429 try {
430 return cloneReset();
431 } catch (final CloneNotSupportedException ex) {
432 return null;
433 }
434 }
435
436 /**
437 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
438 *
439 * @return a new instance of this Tokenizer which has been reset.
440 * @throws CloneNotSupportedException if there is a problem cloning.
441 */
442 Object cloneReset() throws CloneNotSupportedException {
443 // this method exists to enable 100% test coverage
444 final StringTokenizer cloned = (StringTokenizer) super.clone();
445 if (cloned.chars != null) {
446 cloned.chars = cloned.chars.clone();
447 }
448 cloned.reset();
449 return cloned;
450 }
451
452 /**
453 * Gets the String content that the tokenizer is parsing.
454 *
455 * @return The string content being parsed.
456 */
457 public String getContent() {
458 if (chars == null) {
459 return null;
460 }
461 return new String(chars);
462 }
463
464 /**
465 * Gets the field delimiter matcher.
466 *
467 * @return The delimiter matcher in use.
468 */
469 public StringMatcher getDelimiterMatcher() {
470 return this.delimMatcher;
471 }
472
473 /**
474 * Gets the ignored character matcher.
475 * <p>
476 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
477 * </p>
478 *
479 * @return The ignored matcher in use.
480 */
481 public StringMatcher getIgnoredMatcher() {
482 return ignoredMatcher;
483 }
484
485 /**
486 * Gets the quote matcher currently in use.
487 * <p>
488 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
489 * </p>
490 *
491 * @return The quote matcher in use.
492 */
493 public StringMatcher getQuoteMatcher() {
494 return quoteMatcher;
495 }
496
497 /**
498 * Gets a copy of the full token list as an independent modifiable array.
499 *
500 * @return The tokens as a String array.
501 */
502 public String[] getTokenArray() {
503 checkTokenized();
504 return tokens.clone();
505 }
506
507 /**
508 * Gets a copy of the full token list as an independent modifiable list.
509 *
510 * @return The tokens as a String list.
511 */
512 public List<String> getTokenList() {
513 checkTokenized();
514 return new ArrayList<>(Arrays.asList(tokens));
515 }
516
517 /**
518 * Gets the trimmer character matcher.
519 * <p>
520 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
521 * </p>
522 *
523 * @return The trimmer matcher in use.
524 */
525 public StringMatcher getTrimmerMatcher() {
526 return trimmerMatcher;
527 }
528
529 /**
530 * Tests whether there are any more tokens.
531 *
532 * @return true if there are more tokens.
533 */
534 @Override
535 public boolean hasNext() {
536 checkTokenized();
537 return tokenPos < tokens.length;
538 }
539
540 /**
541 * Tests whether there are any previous tokens that can be iterated to.
542 *
543 * @return true if there are previous tokens.
544 */
545 @Override
546 public boolean hasPrevious() {
547 checkTokenized();
548 return tokenPos > 0;
549 }
550
551 /**
552 * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
553 *
554 * @return true if empty tokens are returned as null.
555 */
556 public boolean isEmptyTokenAsNull() {
557 return this.emptyAsNull;
558 }
559
560 /**
561 * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
562 *
563 * @return true if empty tokens are not returned.
564 */
565 public boolean isIgnoreEmptyTokens() {
566 return ignoreEmptyTokens;
567 }
568
569 /**
570 * Tests if the characters at the index specified match the quote already matched in readNextToken().
571 *
572 * @param srcChars the character array being tokenized.
573 * @param pos the position to check for a quote.
574 * @param len the length of the character array being tokenized.
575 * @param quoteStart the start position of the matched quote, 0 if no quoting.
576 * @param quoteLen the length of the matched quote, 0 if no quoting.
577 * @return true if a quote is matched.
578 */
579 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
580 for (int i = 0; i < quoteLen; i++) {
581 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
582 return false;
583 }
584 }
585 return true;
586 }
587
588 /**
589 * Gets the next token.
590 *
591 * @return The next String token.
592 * @throws NoSuchElementException if there are no more elements.
593 */
594 @Override
595 public String next() {
596 if (hasNext()) {
597 return tokens[tokenPos++];
598 }
599 throw new NoSuchElementException();
600 }
601
602 /**
603 * Gets the index of the next token to return.
604 *
605 * @return The next token index.
606 */
607 @Override
608 public int nextIndex() {
609 return tokenPos;
610 }
611
612 /**
613 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
614 * tokens remain.
615 *
616 * @return The next sequential token, or null when no more tokens are found.
617 */
618 public String nextToken() {
619 if (hasNext()) {
620 return tokens[tokenPos++];
621 }
622 return null;
623 }
624
625 /**
626 * Gets the token previous to the last returned token.
627 *
628 * @return The previous token.
629 */
630 @Override
631 public String previous() {
632 if (hasPrevious()) {
633 return tokens[--tokenPos];
634 }
635 throw new NoSuchElementException();
636 }
637
638 /**
639 * Gets the index of the previous token.
640 *
641 * @return The previous token index.
642 */
643 @Override
644 public int previousIndex() {
645 return tokenPos - 1;
646 }
647
648 /**
649 * Gets the previous token from the String.
650 *
651 * @return The previous sequential token, or null when no more tokens are found.
652 */
653 public String previousToken() {
654 if (hasPrevious()) {
655 return tokens[--tokenPos];
656 }
657 return null;
658 }
659
660 /**
661 * Reads character by character through the String to get the next token.
662 *
663 * @param srcChars the character array being tokenized.
664 * @param start the first character of field.
665 * @param len the length of the character array being tokenized.
666 * @param workArea a temporary work area.
667 * @param tokenList the list of parsed tokens.
668 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
669 */
670 private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
671 final List<String> tokenList) {
672 // skip all leading whitespace, unless it is the
673 // field delimiter or the quote character
674 while (start < len) {
675 final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
676 getTrimmerMatcher().isMatch(srcChars, start, start, len));
677 if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
678 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
679 break;
680 }
681 start += removeLen;
682 }
683
684 // handle reaching end
685 if (start >= len) {
686 addToken(tokenList, StringUtils.EMPTY);
687 return -1;
688 }
689
690 // handle empty token
691 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
692 if (delimLen > 0) {
693 addToken(tokenList, StringUtils.EMPTY);
694 return start + delimLen;
695 }
696
697 // handle found token
698 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
699 if (quoteLen > 0) {
700 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
701 }
702 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
703 }
704
705 /**
706 * Reads a possibly quoted string token.
707 *
708 * @param srcChars the character array being tokenized.
709 * @param start the first character of field.
710 * @param len the length of the character array being tokenized.
711 * @param workArea a temporary work area.
712 * @param tokenList the list of parsed tokens.
713 * @param quoteStart the start position of the matched quote, 0 if no quoting.
714 * @param quoteLen the length of the matched quote, 0 if no quoting.
715 * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
716 */
717 private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
718 final List<String> tokenList, final int quoteStart, final int quoteLen) {
719 // Loop until we've found the end of the quoted
720 // string or the end of the input
721 workArea.clear();
722 int pos = start;
723 boolean quoting = quoteLen > 0;
724 int trimStart = 0;
725
726 while (pos < len) {
727 // quoting mode can occur several times throughout a string
728 // we must switch between quoting and non-quoting until we
729 // encounter a non-quoted delimiter, or end of string
730 if (quoting) {
731 // In quoting mode
732
733 // If we've found a quote character, see if it's
734 // followed by a second quote. If so, then we need
735 // to actually put the quote character into the token
736 // rather than end the token.
737 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
738 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
739 // matched pair of quotes, thus an escaped quote
740 workArea.append(srcChars, pos, quoteLen);
741 pos += quoteLen * 2;
742 trimStart = workArea.size();
743 continue;
744 }
745
746 // end of quoting
747 quoting = false;
748 pos += quoteLen;
749 continue;
750 }
751
752 } else {
753 // Not in quoting mode
754
755 // check for delimiter, and thus end of token
756 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
757 if (delimLen > 0) {
758 // return condition when end of token found
759 addToken(tokenList, workArea.substring(0, trimStart));
760 return pos + delimLen;
761 }
762
763 // check for quote, and thus back into quoting mode
764 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
765 quoting = true;
766 pos += quoteLen;
767 continue;
768 }
769
770 // check for ignored (outside quotes), and ignore
771 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
772 if (ignoredLen > 0) {
773 pos += ignoredLen;
774 continue;
775 }
776
777 // check for trimmed character
778 // don't yet know if its at the end, so copy to workArea
779 // use trimStart to keep track of trim at the end
780 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
781 if (trimmedLen > 0) {
782 workArea.append(srcChars, pos, trimmedLen);
783 pos += trimmedLen;
784 continue;
785 }
786 }
787 // copy regular character from inside quotes
788 workArea.append(srcChars[pos++]);
789 trimStart = workArea.size();
790 }
791
792 // return condition when end of string found
793 addToken(tokenList, workArea.substring(0, trimStart));
794 return -1;
795 }
796
797 /**
798 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
799 *
800 * @throws UnsupportedOperationException Always thrown.
801 */
802 @Override
803 public void remove() {
804 throw new UnsupportedOperationException("remove() is unsupported");
805 }
806
807 /**
808 * Resets this tokenizer, forgetting all parsing and iteration already completed.
809 * <p>
810 * This method allows the same tokenizer to be reused for the same String.
811 * </p>
812 *
813 * @return {@code this} instance.
814 */
815 public StringTokenizer reset() {
816 tokenPos = 0;
817 tokens = null;
818 return this;
819 }
820
821 /**
822 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
823 *
824 * @param input the new character array to tokenize, not cloned, null sets no text to parse.
825 * @return {@code this} instance.
826 */
827 public StringTokenizer reset(final char[] input) {
828 reset();
829 this.chars = input != null ? input.clone() : null;
830 return this;
831 }
832
833 /**
834 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
835 *
836 * @param input the new string to tokenize, null sets no text to parse.
837 * @return {@code this} instance.
838 */
839 public StringTokenizer reset(final String input) {
840 reset();
841 this.chars = input != null ? input.toCharArray() : null;
842 return this;
843 }
844
845 /**
846 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
847 *
848 * @param obj this parameter ignored.
849 * @throws UnsupportedOperationException always.
850 */
851 @Override
852 public void set(final String obj) {
853 throw new UnsupportedOperationException("set() is unsupported");
854 }
855
856 /**
857 * Sets the field delimiter character.
858 *
859 * @param delim the delimiter character to use.
860 * @return {@code this} instance.
861 */
862 public StringTokenizer setDelimiterChar(final char delim) {
863 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
864 }
865
866 /**
867 * Sets the field delimiter matcher.
868 * <p>
869 * The delimiter is used to separate one token from another.
870 * </p>
871 *
872 * @param delim the delimiter matcher to use.
873 * @return {@code this} instance.
874 */
875 public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
876 this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
877 return this;
878 }
879
880 /**
881 * Sets the field delimiter string.
882 *
883 * @param delim the delimiter string to use.
884 * @return {@code this} instance.
885 */
886 public StringTokenizer setDelimiterString(final String delim) {
887 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
888 }
889
890 /**
891 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
892 *
893 * @param emptyAsNull whether empty tokens are returned as null.
894 * @return {@code this} instance.
895 */
896 public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
897 this.emptyAsNull = emptyAsNull;
898 return this;
899 }
900
901 /**
902 * Sets the character to ignore.
903 * <p>
904 * This character is ignored when parsing the String, unless it is within a quoted region.
905 * </p>
906 *
907 * @param ignored the ignored character to use.
908 * @return {@code this} instance.
909 */
910 public StringTokenizer setIgnoredChar(final char ignored) {
911 return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
912 }
913
914 /**
915 * Sets the matcher for characters to ignore.
916 * <p>
917 * These characters are ignored when parsing the String, unless they are within a quoted region.
918 * </p>
919 *
920 * @param ignored the ignored matcher to use, null ignored.
921 * @return {@code this} instance.
922 */
923 public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
924 if (ignored != null) {
925 this.ignoredMatcher = ignored;
926 }
927 return this;
928 }
929
930 /**
931 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
932 *
933 * @param ignoreEmptyTokens whether empty tokens are not returned.
934 * @return {@code this} instance.
935 */
936 public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
937 this.ignoreEmptyTokens = ignoreEmptyTokens;
938 return this;
939 }
940
941 /**
942 * Sets the quote character to use.
943 * <p>
944 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
945 * </p>
946 *
947 * @param quote the quote character to use.
948 * @return {@code this} instance.
949 */
950 public StringTokenizer setQuoteChar(final char quote) {
951 return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
952 }
953
954 /**
955 * Sets the quote matcher to use.
956 * <p>
957 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
958 * </p>
959 *
960 * @param quote the quote matcher to use, null ignored.
961 * @return {@code this} instance.
962 */
963 public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
964 if (quote != null) {
965 this.quoteMatcher = quote;
966 }
967 return this;
968 }
969
970 /**
971 * Sets the matcher for characters to trim.
972 * <p>
973 * These characters are trimmed off on each side of the delimiter until the token or quote is found.
974 *
975 * @param trimmer the trimmer matcher to use, null ignored.
976 * @return {@code this} instance.
977 */
978 public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
979 if (trimmer != null) {
980 this.trimmerMatcher = trimmer;
981 }
982 return this;
983 }
984
985 /**
986 * Gets the number of tokens found in the String.
987 *
988 * @return The number of matched tokens.
989 */
990 public int size() {
991 checkTokenized();
992 return tokens.length;
993 }
994
995 /**
996 * Internal method to performs the tokenization.
997 * <p>
998 * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
999 * </p>
1000 * <p>
1001 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1002 * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1003 * </p>
1004 * <p>
1005 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1006 * values, or even an entirely different array.
1007 * </p>
1008 *
1009 * @param srcChars the character array being tokenized, may be null.
1010 * @param offset the start position within the character array, must be valid.
1011 * @param count the number of characters to tokenize, must be valid.
1012 * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1013 */
1014 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1015 if (srcChars == null || count == 0) {
1016 return Collections.emptyList();
1017 }
1018 final TextStringBuilder buf = new TextStringBuilder();
1019 final List<String> tokenList = new ArrayList<>();
1020 int pos = offset;
1021 // loop around the entire buffer
1022 while (pos >= 0 && pos < count) {
1023 // find next token
1024 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1025 // handle case where end of string is a delimiter
1026 if (pos >= count) {
1027 addToken(tokenList, StringUtils.EMPTY);
1028 }
1029 }
1030 return tokenList;
1031 }
1032
1033 /**
1034 * Gets the String content that the tokenizer is parsing.
1035 *
1036 * @return The string content being parsed.
1037 */
1038 @Override
1039 public String toString() {
1040 if (tokens == null) {
1041 return "StringTokenizer[not tokenized yet]";
1042 }
1043 return "StringTokenizer" + getTokenList();
1044 }
1045 }