1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text;
18
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.NoSuchElementException;
24
25 import org.apache.commons.lang3.ArrayUtils;
26 import org.apache.commons.lang3.StringUtils;
27
28 /**
29 * Tokenizes a string based on delimiters (separators)
30 * and supporting quoting and ignored character concepts.
31 * <p>
32 * This class can split a String into many smaller strings. It aims
33 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34 * however it offers much more control and flexibility including implementing
35 * the {@code ListIterator} interface. By default, it is set up
36 * like {@code StringTokenizer}.
37 * <p>
38 * The input String is split into a number of <em>tokens</em>.
39 * Each token is separated from the next String by a <em>delimiter</em>.
40 * One or more delimiter characters must be specified.
41 * <p>
42 * Each token may be surrounded by quotes.
43 * The <em>quote</em> matcher specifies the quote character(s).
44 * A quote may be escaped within a quoted section by duplicating itself.
45 * <p>
46 * Between each token and the delimiter are potentially characters that need trimming.
47 * The <em>trimmer</em> matcher specifies these characters.
48 * One usage might be to trim whitespace characters.
49 * <p>
50 * At any point outside the quotes there might potentially be invalid characters.
51 * The <em>ignored</em> matcher specifies these characters to be removed.
52 * One usage might be to remove new line characters.
53 * <p>
54 * Empty tokens may be removed or returned as null.
55 * <pre>
56 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
57 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
58 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59 * </pre>
60 *
61 * <table>
62 * <caption>StrTokenizer properties and options</caption>
63 * <tr>
64 * <th>Property</th><th>Type</th><th>Default</th>
65 * </tr>
66 * <tr>
67 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68 * </tr>
69 * <tr>
70 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
71 * </tr>
72 * <tr>
73 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74 * </tr>
75 * <tr>
76 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77 * </tr>
78 * <tr>
79 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80 * </tr>
81 * </table>
82 *
83 * @since 1.0
84 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85 */
86 @Deprecated
87 public class StrTokenizer implements ListIterator<String>, Cloneable {
88
89 /** Comma separated values tokenizer internal variable. */
90 // @formatter:off
91 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
92 .setDelimiterMatcher(StrMatcher.commaMatcher())
93 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
94 .setIgnoredMatcher(StrMatcher.noneMatcher())
95 .setTrimmerMatcher(StrMatcher.trimMatcher())
96 .setEmptyTokenAsNull(false)
97 .setIgnoreEmptyTokens(false);
98 // @formatter:on
99
100 /** Tab separated values tokenizer internal variable. */
101 // @formatter:off
102 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
103 .setDelimiterMatcher(StrMatcher.tabMatcher())
104 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
105 .setIgnoredMatcher(StrMatcher.noneMatcher())
106 .setTrimmerMatcher(StrMatcher.trimMatcher())
107 .setEmptyTokenAsNull(false)
108 .setIgnoreEmptyTokens(false);
109 // @formatter:on
110
111 /**
112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113 *
114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115 */
116 private static StrTokenizer getCSVClone() {
117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118 }
119
120 /**
121 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
122 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
123 * <p>
124 * You must call a "reset" method to set the string which you want to parse.
125 * </p>
126 *
127 * @return a new tokenizer instance which parses Comma Separated Value strings.
128 */
129 public static StrTokenizer getCSVInstance() {
130 return getCSVClone();
131 }
132
133 /**
134 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
135 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
136 *
137 * @param input the text to parse.
138 * @return a new tokenizer instance which parses Comma Separated Value strings.
139 */
140 public static StrTokenizer getCSVInstance(final char[] input) {
141 final StrTokenizer tok = getCSVClone();
142 tok.reset(input);
143 return tok;
144 }
145
146 /**
147 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
148 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
149 *
150 * @param input the text to parse.
151 * @return a new tokenizer instance which parses Comma Separated Value strings.
152 */
153 public static StrTokenizer getCSVInstance(final String input) {
154 final StrTokenizer tok = getCSVClone();
155 tok.reset(input);
156 return tok;
157 }
158
159 /**
160 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
161 *
162 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
163 */
164 private static StrTokenizer getTSVClone() {
165 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
166 }
167
168 /**
169 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
170 * be overridden with the setTrimmer method).
171 * <p>
172 * You must call a "reset" method to set the string which you want to parse.
173 * </p>
174 *
175 * @return a new tokenizer instance which parses Tab Separated Value strings.
176 */
177 public static StrTokenizer getTSVInstance() {
178 return getTSVClone();
179 }
180
181 /**
182 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
183 * be overridden with the setTrimmer method).
184 *
185 * @param input the string to parse.
186 * @return a new tokenizer instance which parses Tab Separated Value strings.
187 */
188 public static StrTokenizer getTSVInstance(final char[] input) {
189 final StrTokenizer tok = getTSVClone();
190 tok.reset(input);
191 return tok;
192 }
193
194 /**
195 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
196 * be overridden with the setTrimmer method).
197 *
198 * @param input the string to parse.
199 * @return a new tokenizer instance which parses Tab Separated Value strings.
200 */
201 public static StrTokenizer getTSVInstance(final String input) {
202 final StrTokenizer tok = getTSVClone();
203 tok.reset(input);
204 return tok;
205 }
206
207 /** The text to work on. */
208 private char[] chars;
209
210 /** The parsed tokens. */
211 private String[] tokens;
212
213 /** The current iteration position. */
214 private int tokenPos;
215
216 /** The delimiter matcher. */
217 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
218
219 /** The quote matcher. */
220 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
221
222 /** The ignored matcher. */
223 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
224
225 /** The trimmer matcher. */
226 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
227
228 /** Whether to return empty tokens as null. */
229 private boolean emptyAsNull;
230
231 /** Whether to ignore empty tokens. */
232 private boolean ignoreEmptyTokens = true;
233
234 /**
235 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
236 * <p>
237 * This constructor is normally used with {@link #reset(String)}.
238 * </p>
239 */
240 public StrTokenizer() {
241 this.chars = null;
242 }
243
244 /**
245 * Constructs a tokenizer splitting on space, tab, newline and form feed
246 * as per StringTokenizer.
247 *
248 * @param input the string which is to be parsed, not cloned.
249 */
250 public StrTokenizer(final char[] input) {
251 if (input == null) {
252 this.chars = null;
253 } else {
254 this.chars = input.clone();
255 }
256 }
257
258 /**
259 * Constructs a tokenizer splitting on the specified character.
260 *
261 * @param input the string which is to be parsed, not cloned.
262 * @param delim the field delimiter character.
263 */
264 public StrTokenizer(final char[] input, final char delim) {
265 this(input);
266 setDelimiterChar(delim);
267 }
268
269 /**
270 * Constructs a tokenizer splitting on the specified delimiter character
271 * and handling quotes using the specified quote character.
272 *
273 * @param input the string which is to be parsed, not cloned.
274 * @param delim the field delimiter character.
275 * @param quote the field quoted string character.
276 */
277 public StrTokenizer(final char[] input, final char delim, final char quote) {
278 this(input, delim);
279 setQuoteChar(quote);
280 }
281
282 /**
283 * Constructs a tokenizer splitting on the specified string.
284 *
285 * @param input the string which is to be parsed, not cloned.
286 * @param delim the field delimiter string.
287 */
288 public StrTokenizer(final char[] input, final String delim) {
289 this(input);
290 setDelimiterString(delim);
291 }
292
293 /**
294 * Constructs a tokenizer splitting using the specified delimiter matcher.
295 *
296 * @param input the string which is to be parsed, not cloned.
297 * @param delim the field delimiter matcher.
298 */
299 public StrTokenizer(final char[] input, final StrMatcher delim) {
300 this(input);
301 setDelimiterMatcher(delim);
302 }
303
304 /**
305 * Constructs a tokenizer splitting using the specified delimiter matcher
306 * and handling quotes using the specified quote matcher.
307 *
308 * @param input the string which is to be parsed, not cloned.
309 * @param delim the field delimiter character.
310 * @param quote the field quoted string character.
311 */
312 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
313 this(input, delim);
314 setQuoteMatcher(quote);
315 }
316
317 /**
318 * Constructs a tokenizer splitting on space, tab, newline and form feed
319 * as per StringTokenizer.
320 *
321 * @param input the string which is to be parsed.
322 */
323 public StrTokenizer(final String input) {
324 if (input != null) {
325 chars = input.toCharArray();
326 } else {
327 chars = null;
328 }
329 }
330
331 /**
332 * Constructs a tokenizer splitting on the specified delimiter character.
333 *
334 * @param input the string which is to be parsed.
335 * @param delim the field delimiter character.
336 */
337 public StrTokenizer(final String input, final char delim) {
338 this(input);
339 setDelimiterChar(delim);
340 }
341
342 /**
343 * Constructs a tokenizer splitting on the specified delimiter character
344 * and handling quotes using the specified quote character.
345 *
346 * @param input the string which is to be parsed.
347 * @param delim the field delimiter character.
348 * @param quote the field quoted string character.
349 */
350 public StrTokenizer(final String input, final char delim, final char quote) {
351 this(input, delim);
352 setQuoteChar(quote);
353 }
354
355 /**
356 * Constructs a tokenizer splitting on the specified delimiter string.
357 *
358 * @param input the string which is to be parsed.
359 * @param delim the field delimiter string.
360 */
361 public StrTokenizer(final String input, final String delim) {
362 this(input);
363 setDelimiterString(delim);
364 }
365
366 /**
367 * Constructs a tokenizer splitting using the specified delimiter matcher.
368 *
369 * @param input the string which is to be parsed.
370 * @param delim the field delimiter matcher.
371 */
372 public StrTokenizer(final String input, final StrMatcher delim) {
373 this(input);
374 setDelimiterMatcher(delim);
375 }
376
377 /**
378 * Constructs a tokenizer splitting using the specified delimiter matcher
379 * and handling quotes using the specified quote matcher.
380 *
381 * @param input the string which is to be parsed.
382 * @param delim the field delimiter matcher.
383 * @param quote the field quoted string matcher.
384 */
385 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
386 this(input, delim);
387 setQuoteMatcher(quote);
388 }
389
390 /**
391 * Unsupported ListIterator operation.
392 * @param obj this parameter ignored.
393 * @throws UnsupportedOperationException always.
394 */
395 @Override
396 public void add(final String obj) {
397 throw new UnsupportedOperationException("add() is unsupported");
398 }
399
400 /**
401 * Adds a token to a list, paying attention to the parameters we've set.
402 *
403 * @param list the list to add to.
404 * @param tok the token to add.
405 */
406 private void addToken(final List<String> list, String tok) {
407 if (tok == null || tok.isEmpty()) {
408 if (isIgnoreEmptyTokens()) {
409 return;
410 }
411 if (isEmptyTokenAsNull()) {
412 tok = null;
413 }
414 }
415 list.add(tok);
416 }
417
418 /**
419 * Checks if tokenization has been done, and if not then do it.
420 */
421 private void checkTokenized() {
422 if (tokens == null) {
423 if (chars == null) {
424 // still call tokenize as subclass may do some work
425 final List<String> split = tokenize(null, 0, 0);
426 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
427 } else {
428 final List<String> split = tokenize(chars, 0, chars.length);
429 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
430 }
431 }
432 }
433
434 /**
435 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
436 * {@link CloneNotSupportedException} is caught, return {@code null}.
437 *
438 * @return a new instance of this Tokenizer which has been reset.
439 */
440 @Override
441 public Object clone() {
442 try {
443 return cloneReset();
444 } catch (final CloneNotSupportedException ex) {
445 return null;
446 }
447 }
448
449 /**
450 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
451 *
452 * @return a new instance of this Tokenizer which has been reset.
453 * @throws CloneNotSupportedException if there is a problem cloning.
454 */
455 Object cloneReset() throws CloneNotSupportedException {
456 // this method exists to enable 100% test coverage
457 final StrTokenizer cloned = (StrTokenizer) super.clone();
458 if (cloned.chars != null) {
459 cloned.chars = cloned.chars.clone();
460 }
461 cloned.reset();
462 return cloned;
463 }
464
465 /**
466 * Gets the String content that the tokenizer is parsing.
467 *
468 * @return The string content being parsed.
469 */
470 public String getContent() {
471 if (chars == null) {
472 return null;
473 }
474 return new String(chars);
475 }
476
477 /**
478 * Gets the field delimiter matcher.
479 *
480 * @return The delimiter matcher in use.
481 */
482 public StrMatcher getDelimiterMatcher() {
483 return this.delimMatcher;
484 }
485
486 /**
487 * Gets the ignored character matcher.
488 * <p>
489 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
490 * </p>
491 *
492 * @return The ignored matcher in use.
493 */
494 public StrMatcher getIgnoredMatcher() {
495 return ignoredMatcher;
496 }
497
498 /**
499 * Gets the quote matcher currently in use.
500 * <p>
501 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
502 * </p>
503 *
504 * @return The quote matcher in use.
505 */
506 public StrMatcher getQuoteMatcher() {
507 return quoteMatcher;
508 }
509
510 /**
511 * Gets a copy of the full token list as an independent modifiable array.
512 *
513 * @return The tokens as a String array.
514 */
515 public String[] getTokenArray() {
516 checkTokenized();
517 return tokens.clone();
518 }
519
520 /**
521 * Gets a copy of the full token list as an independent modifiable list.
522 *
523 * @return The tokens as a String array.
524 */
525 public List<String> getTokenList() {
526 checkTokenized();
527 final List<String> list = new ArrayList<>(tokens.length);
528 Collections.addAll(list, tokens);
529
530 return list;
531 }
532
533 /**
534 * Gets the trimmer character matcher.
535 * <p>
536 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
537 * </p>
538 *
539 * @return The trimmer matcher in use.
540 */
541 public StrMatcher getTrimmerMatcher() {
542 return trimmerMatcher;
543 }
544
545 /**
546 * Checks whether there are any more tokens.
547 *
548 * @return true if there are more tokens.
549 */
550 @Override
551 public boolean hasNext() {
552 checkTokenized();
553 return tokenPos < tokens.length;
554 }
555
556 /**
557 * Checks whether there are any previous tokens that can be iterated to.
558 *
559 * @return true if there are previous tokens.
560 */
561 @Override
562 public boolean hasPrevious() {
563 checkTokenized();
564 return tokenPos > 0;
565 }
566
567 /**
568 * Gets whether the tokenizer currently returns empty tokens as null.
569 * The default for this property is false.
570 *
571 * @return true if empty tokens are returned as null.
572 */
573 public boolean isEmptyTokenAsNull() {
574 return this.emptyAsNull;
575 }
576
577 /**
578 * Gets whether the tokenizer currently ignores empty tokens.
579 * The default for this property is true.
580 *
581 * @return true if empty tokens are not returned.
582 */
583 public boolean isIgnoreEmptyTokens() {
584 return ignoreEmptyTokens;
585 }
586
587 /**
588 * Checks if the characters at the index specified match the quote
589 * already matched in readNextToken().
590 *
591 * @param srcChars the character array being tokenized.
592 * @param pos the position to check for a quote.
593 * @param len the length of the character array being tokenized.
594 * @param quoteStart the start position of the matched quote, 0 if no quoting.
595 * @param quoteLen the length of the matched quote, 0 if no quoting.
596 * @return true if a quote is matched.
597 */
598 private boolean isQuote(final char[] srcChars,
599 final int pos,
600 final int len,
601 final int quoteStart,
602 final int quoteLen) {
603 for (int i = 0; i < quoteLen; i++) {
604 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
605 return false;
606 }
607 }
608 return true;
609 }
610
611 /**
612 * Gets the next token.
613 *
614 * @return The next String token.
615 * @throws NoSuchElementException if there are no more elements.
616 */
617 @Override
618 public String next() {
619 if (hasNext()) {
620 return tokens[tokenPos++];
621 }
622 throw new NoSuchElementException();
623 }
624
625 /**
626 * Gets the index of the next token to return.
627 *
628 * @return The next token index.
629 */
630 @Override
631 public int nextIndex() {
632 return tokenPos;
633 }
634
635 /**
636 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
637 * tokens remain.
638 *
639 * @return The next sequential token, or null when no more tokens are found.
640 */
641 public String nextToken() {
642 if (hasNext()) {
643 return tokens[tokenPos++];
644 }
645 return null;
646 }
647
648 /**
649 * Gets the token previous to the last returned token.
650 *
651 * @return The previous token.
652 */
653 @Override
654 public String previous() {
655 if (hasPrevious()) {
656 return tokens[--tokenPos];
657 }
658 throw new NoSuchElementException();
659 }
660
661 /**
662 * Gets the index of the previous token.
663 *
664 * @return The previous token index.
665 */
666 @Override
667 public int previousIndex() {
668 return tokenPos - 1;
669 }
670
671 /**
672 * Gets the previous token from the String.
673 *
674 * @return The previous sequential token, or null when no more tokens are found.
675 */
676 public String previousToken() {
677 if (hasPrevious()) {
678 return tokens[--tokenPos];
679 }
680 return null;
681 }
682
683 /**
684 * Reads character by character through the String to get the next token.
685 *
686 * @param srcChars the character array being tokenized.
687 * @param start the first character of field.
688 * @param len the length of the character array being tokenized.
689 * @param workArea a temporary work area.
690 * @param tokenList the list of parsed tokens.
691 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
692 */
693 private int readNextToken(final char[] srcChars,
694 int start,
695 final int len,
696 final StrBuilder workArea,
697 final List<String> tokenList) {
698 // skip all leading whitespace, unless it is the
699 // field delimiter or the quote character
700 while (start < len) {
701 final int removeLen = Math.max(
702 getIgnoredMatcher().isMatch(srcChars, start, start, len),
703 getTrimmerMatcher().isMatch(srcChars, start, start, len));
704 if (removeLen == 0
705 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
706 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
707 break;
708 }
709 start += removeLen;
710 }
711
712 // handle reaching end
713 if (start >= len) {
714 addToken(tokenList, StringUtils.EMPTY);
715 return -1;
716 }
717
718 // handle empty token
719 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
720 if (delimLen > 0) {
721 addToken(tokenList, StringUtils.EMPTY);
722 return start + delimLen;
723 }
724
725 // handle found token
726 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
727 if (quoteLen > 0) {
728 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
729 }
730 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
731 }
732
733 /**
734 * Reads a possibly quoted string token.
735 *
736 * @param srcChars the character array being tokenized.
737 * @param start the first character of field.
738 * @param len the length of the character array being tokenized.
739 * @param workArea a temporary work area.
740 * @param tokenList the list of parsed tokens.
741 * @param quoteStart the start position of the matched quote, 0 if no quoting.
742 * @param quoteLen the length of the matched quote, 0 if no quoting.
743 * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
744 */
745 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
746 final List<String> tokenList, final int quoteStart, final int quoteLen) {
747 // Loop until we've found the end of the quoted
748 // string or the end of the input
749 workArea.clear();
750 int pos = start;
751 boolean quoting = quoteLen > 0;
752 int trimStart = 0;
753
754 while (pos < len) {
755 // quoting mode can occur several times throughout a string
756 // we must switch between quoting and non-quoting until we
757 // encounter a non-quoted delimiter, or end of string
758 if (quoting) {
759 // In quoting mode
760
761 // If we've found a quote character, see if it's
762 // followed by a second quote. If so, then we need
763 // to actually put the quote character into the token
764 // rather than end the token.
765 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
766 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
767 // matched pair of quotes, thus an escaped quote
768 workArea.append(srcChars, pos, quoteLen);
769 pos += quoteLen * 2;
770 trimStart = workArea.size();
771 continue;
772 }
773
774 // end of quoting
775 quoting = false;
776 pos += quoteLen;
777 continue;
778 }
779
780 } else {
781 // Not in quoting mode
782
783 // check for delimiter, and thus end of token
784 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
785 if (delimLen > 0) {
786 // return condition when end of token found
787 addToken(tokenList, workArea.substring(0, trimStart));
788 return pos + delimLen;
789 }
790
791 // check for quote, and thus back into quoting mode
792 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
793 quoting = true;
794 pos += quoteLen;
795 continue;
796 }
797
798 // check for ignored (outside quotes), and ignore
799 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
800 if (ignoredLen > 0) {
801 pos += ignoredLen;
802 continue;
803 }
804
805 // check for trimmed character
806 // don't yet know if its at the end, so copy to workArea
807 // use trimStart to keep track of trim at the end
808 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
809 if (trimmedLen > 0) {
810 workArea.append(srcChars, pos, trimmedLen);
811 pos += trimmedLen;
812 continue;
813 }
814
815 }
816 // copy regular character from inside quotes
817 workArea.append(srcChars[pos++]);
818 trimStart = workArea.size();
819 }
820
821 // return condition when end of string found
822 addToken(tokenList, workArea.substring(0, trimStart));
823 return -1;
824 }
825
826 /**
827 * Unsupported ListIterator operation.
828 *
829 * @throws UnsupportedOperationException always.
830 */
831 @Override
832 public void remove() {
833 throw new UnsupportedOperationException("remove() is unsupported");
834 }
835
836 /**
837 * Resets this tokenizer, forgetting all parsing and iteration already completed.
838 * <p>
839 * This method allows the same tokenizer to be reused for the same String.
840 * </p>
841 *
842 * @return {@code this} instance.
843 */
844 public StrTokenizer reset() {
845 tokenPos = 0;
846 tokens = null;
847 return this;
848 }
849
850 /**
851 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
852 *
853 * @param input the new character array to tokenize, not cloned, null sets no text to parse.
854 * @return {@code this} instance.
855 */
856 public StrTokenizer reset(final char[] input) {
857 reset();
858 if (input != null) {
859 this.chars = input.clone();
860 } else {
861 this.chars = null;
862 }
863 return this;
864 }
865
866 /**
867 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
868 *
869 * @param input the new string to tokenize, null sets no text to parse.
870 * @return {@code this} instance.
871 */
872 public StrTokenizer reset(final String input) {
873 reset();
874 if (input != null) {
875 this.chars = input.toCharArray();
876 } else {
877 this.chars = null;
878 }
879 return this;
880 }
881
882 /**
883 * Unsupported ListIterator operation.
884 *
885 * @param obj this parameter ignored.
886 * @throws UnsupportedOperationException Always thrown.
887 */
888 @Override
889 public void set(final String obj) {
890 throw new UnsupportedOperationException("set() is unsupported");
891 }
892
893 /**
894 * Sets the field delimiter character.
895 *
896 * @param delim the delimiter character to use.
897 * @return {@code this} instance.
898 */
899 public StrTokenizer setDelimiterChar(final char delim) {
900 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
901 }
902
903 /**
904 * Sets the field delimiter matcher.
905 * <p>
906 * The delimiter is used to separate one token from another.
907 * </p>
908 *
909 * @param delim the delimiter matcher to use.
910 * @return {@code this} instance.
911 */
912 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
913 if (delim == null) {
914 this.delimMatcher = StrMatcher.noneMatcher();
915 } else {
916 this.delimMatcher = delim;
917 }
918 return this;
919 }
920
921 /**
922 * Sets the field delimiter string.
923 *
924 * @param delim the delimiter string to use.
925 * @return {@code this} instance.
926 */
927 public StrTokenizer setDelimiterString(final String delim) {
928 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
929 }
930
931 /**
932 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
933 *
934 * @param emptyAsNull whether empty tokens are returned as null.
935 * @return {@code this} instance.
936 */
937 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
938 this.emptyAsNull = emptyAsNull;
939 return this;
940 }
941
942 /**
943 * Sets the character to ignore.
944 * <p>
945 * This character is ignored when parsing the String, unless it is within a quoted region.
946 * </p>
947 *
948 * @param ignored the ignored character to use.
949 * @return {@code this} instance.
950 */
951 public StrTokenizer setIgnoredChar(final char ignored) {
952 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
953 }
954
955 /**
956 * Sets the matcher for characters to ignore.
957 * <p>
958 * These characters are ignored when parsing the String, unless they are within a quoted region.
959 * </p>
960 *
961 * @param ignored the ignored matcher to use, null ignored.
962 * @return {@code this} instance.
963 */
964 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
965 if (ignored != null) {
966 this.ignoredMatcher = ignored;
967 }
968 return this;
969 }
970
971 /**
972 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
973 *
974 * @param ignoreEmptyTokens whether empty tokens are not returned.
975 * @return {@code this} instance.
976 */
977 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
978 this.ignoreEmptyTokens = ignoreEmptyTokens;
979 return this;
980 }
981
982 /**
983 * Sets the quote character to use.
984 * <p>
985 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
986 * </p>
987 *
988 * @param quote the quote character to use.
989 * @return {@code this} instance.
990 */
991 public StrTokenizer setQuoteChar(final char quote) {
992 return setQuoteMatcher(StrMatcher.charMatcher(quote));
993 }
994
995 /**
996 * Sets the quote matcher to use.
997 * <p>
998 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
999 * </p>
1000 *
1001 * @param quote the quote matcher to use, null ignored.
1002 * @return {@code this} instance.
1003 */
1004 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1005 if (quote != null) {
1006 this.quoteMatcher = quote;
1007 }
1008 return this;
1009 }
1010
1011 /**
1012 * Sets the matcher for characters to trim.
1013 * <p>
1014 * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1015 * </p>
1016 *
1017 * @param trimmer the trimmer matcher to use, null ignored
1018 * @return {@code this} instance.
1019 */
1020 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1021 if (trimmer != null) {
1022 this.trimmerMatcher = trimmer;
1023 }
1024 return this;
1025 }
1026
1027 /**
1028 * Gets the number of tokens found in the String.
1029 *
1030 * @return The number of matched tokens.
1031 */
1032 public int size() {
1033 checkTokenized();
1034 return tokens.length;
1035 }
1036
1037 /**
1038 * Internal method to performs the tokenization.
1039 * <p>
1040 * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
1041 * </p>
1042 * <p>
1043 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1044 * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1045 * </p>
1046 * <p>
1047 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1048 * values, or even an entirely different array.
1049 * </p>
1050 *
1051 * @param srcChars the character array being tokenized, may be null.
1052 * @param offset the start position within the character array, must be valid.
1053 * @param count the number of characters to tokenize, must be valid.
1054 * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1055 */
1056 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1057 if (srcChars == null || count == 0) {
1058 return Collections.emptyList();
1059 }
1060 final StrBuilder buf = new StrBuilder();
1061 final List<String> tokenList = new ArrayList<>();
1062 int pos = offset;
1063
1064 // loop around the entire buffer
1065 while (pos >= 0 && pos < count) {
1066 // find next token
1067 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1068
1069 // handle case where end of string is a delimiter
1070 if (pos >= count) {
1071 addToken(tokenList, StringUtils.EMPTY);
1072 }
1073 }
1074 return tokenList;
1075 }
1076
1077 /**
1078 * Gets the String content that the tokenizer is parsing.
1079 *
1080 * @return The string content being parsed.
1081 */
1082 @Override
1083 public String toString() {
1084 if (tokens == null) {
1085 return "StrTokenizer[not tokenized yet]";
1086 }
1087 return "StrTokenizer" + getTokenList();
1088 }
1089
1090 }