View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  
26  import org.apache.commons.lang3.ArrayUtils;
27  import org.apache.commons.lang3.StringUtils;
28  import org.apache.commons.text.matcher.StringMatcher;
29  import org.apache.commons.text.matcher.StringMatcherFactory;
30  
31  /**
32   * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
33   * <p>
34   * This class can split a String into many smaller strings. It aims to do a similar job to
35   * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
36   * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
39   * <em>delimiter</em>. One or more delimiter characters must be specified.
40   * <p>
41   * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
42   * escaped within a quoted section by duplicating itself.
43   * <p>
44   * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
45   * specifies these characters. One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
48   * these characters to be removed. One usage might be to remove new line characters.
49   * <p>
50   * Empty tokens may be removed or returned as null.
51   *
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   *
58   * <table>
59   * <caption>StringTokenizer properties and options</caption>
60   * <tr>
61   * <th>Property</th>
62   * <th>Type</th>
63   * <th>Default</th>
64   * </tr>
65   * <tr>
66   * <td>delim</td>
67   * <td>CharSetMatcher</td>
68   * <td>{ \t\n\r\f}</td>
69   * </tr>
70   * <tr>
71   * <td>quote</td>
72   * <td>NoneMatcher</td>
73   * <td>{}</td>
74   * </tr>
75   * <tr>
76   * <td>ignore</td>
77   * <td>NoneMatcher</td>
78   * <td>{}</td>
79   * </tr>
80   * <tr>
81   * <td>emptyTokenAsNull</td>
82   * <td>boolean</td>
83   * <td>false</td>
84   * </tr>
85   * <tr>
86   * <td>ignoreEmptyTokens</td>
87   * <td>boolean</td>
88   * <td>true</td>
89   * </tr>
90   * </table>
91   *
92   * @since 1.3
93   */
94  public class StringTokenizer implements ListIterator<String>, Cloneable {
95  
    /**
     * Comma separated values tokenizer prototype: comma delimiter, double-quote quoting, nothing ignored, the
     * factory's trim matcher for trimming, and empty tokens kept as empty strings (neither dropped nor turned
     * into {@code null}). It serves as the template that {@code getCSVClone()} copies.
     */
    // @formatter:off
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
106 
    /**
     * Tab separated values tokenizer prototype: tab delimiter, double-quote quoting, nothing ignored, the
     * factory's trim matcher for trimming, and empty tokens kept as empty strings (neither dropped nor turned
     * into {@code null}). It serves as the template that {@code getTSVClone()} copies.
     */
    // @formatter:off
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
117 
118     /**
119      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
120      *
121      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122      */
123     private static StringTokenizer getCSVClone() {
124         return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
125     }
126 
127     /**
128      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
129      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
130      * setTrimmer method).
131      * <p>
132      * You must call a "reset" method to set the string which you want to parse.
133      * </p>
134      *
135      * @return a new tokenizer instance which parses Comma Separated Value strings.
136      */
137     public static StringTokenizer getCSVInstance() {
138         return getCSVClone();
139     }
140 
141     /**
142      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
143      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
144      *
145      * @param input the text to parse.
146      * @return a new tokenizer instance which parses Comma Separated Value strings.
147      */
148     public static StringTokenizer getCSVInstance(final char[] input) {
149         return getCSVClone().reset(input);
150     }
151 
152     /**
153      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
154      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
155      *
156      * @param input the text to parse.
157      * @return a new tokenizer instance which parses Comma Separated Value strings.
158      */
159     public static StringTokenizer getCSVInstance(final String input) {
160         return getCSVClone().reset(input);
161     }
162 
163     /**
164      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
165      *
166      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167      */
168     private static StringTokenizer getTSVClone() {
169         return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
170     }
171 
172     /**
173      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
174      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
175      * <p>
176      * You must call a "reset" method to set the string which you want to parse.
177      * </p>
178      *
179      * @return a new tokenizer instance which parses Tab Separated Value strings.
180      */
181     public static StringTokenizer getTSVInstance() {
182         return getTSVClone();
183     }
184 
185     /**
186      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
187      * be overridden with the setTrimmer method).
188      *
189      * @param input the string to parse.
190      * @return a new tokenizer instance which parses Tab Separated Value strings.
191      */
192     public static StringTokenizer getTSVInstance(final char[] input) {
193         return getTSVClone().reset(input);
194     }
195 
196     /**
197      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
198      * be overridden with the setTrimmer method).
199      *
200      * @param input the string to parse.
201      * @return a new tokenizer instance which parses Tab Separated Value strings.
202      */
203     public static StringTokenizer getTSVInstance(final String input) {
204         return getTSVClone().reset(input);
205     }
206 
    /** The text to work on; {@code null} means no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and set back to {@code null} by {@link #reset()}. */
    private String[] tokens;

    /** The current iteration position: the index into {@code tokens} of the token that {@link #next()} returns. */
    private int tokenPos;

    /** The delimiter matcher; initialized to the factory's split matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher; initialized to the factory's none matcher, so quoting is off by default. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher; initialized to the factory's none matcher, so nothing is ignored by default. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher; initialized to the factory's none matcher, so nothing is trimmed by default. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
233 
234     /**
235      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
236      * <p>
237      * This constructor is normally used with {@link #reset(String)}.
238      * </p>
239      */
240     public StringTokenizer() {
241         this.chars = null;
242     }
243 
244     /**
245      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
246      *
247      * @param input the string which is to be parsed, not cloned.
248      */
249     public StringTokenizer(final char[] input) {
250         this.chars = input != null ? input.clone() : null;
251     }
252 
253     /**
254      * Constructs a tokenizer splitting on the specified character.
255      *
256      * @param input the string which is to be parsed, not cloned.
257      * @param delim the field delimiter character.
258      */
259     public StringTokenizer(final char[] input, final char delim) {
260         this(input);
261         setDelimiterChar(delim);
262     }
263 
264     /**
265      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
266      *
267      * @param input the string which is to be parsed, not cloned.
268      * @param delim the field delimiter character.
269      * @param quote the field quoted string character.
270      */
271     public StringTokenizer(final char[] input, final char delim, final char quote) {
272         this(input, delim);
273         setQuoteChar(quote);
274     }
275 
276     /**
277      * Constructs a tokenizer splitting on the specified string.
278      *
279      * @param input the string which is to be parsed, not cloned.
280      * @param delim the field delimiter string.
281      */
282     public StringTokenizer(final char[] input, final String delim) {
283         this(input);
284         setDelimiterString(delim);
285     }
286 
287     /**
288      * Constructs a tokenizer splitting using the specified delimiter matcher.
289      *
290      * @param input the string which is to be parsed, not cloned.
291      * @param delim the field delimiter matcher.
292      */
293     public StringTokenizer(final char[] input, final StringMatcher delim) {
294         this(input);
295         setDelimiterMatcher(delim);
296     }
297 
298     /**
299      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
300      *
301      * @param input the string which is to be parsed, not cloned.
302      * @param delim the field delimiter character.
303      * @param quote the field quoted string character.
304      */
305     public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
306         this(input, delim);
307         setQuoteMatcher(quote);
308     }
309 
310     /**
311      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
312      *
313      * @param input the string which is to be parsed.
314      */
315     public StringTokenizer(final String input) {
316         this.chars = input != null ? input.toCharArray() : null;
317     }
318 
319     /**
320      * Constructs a tokenizer splitting on the specified delimiter character.
321      *
322      * @param input the string which is to be parsed.
323      * @param delim the field delimiter character.
324      */
325     public StringTokenizer(final String input, final char delim) {
326         this(input);
327         setDelimiterChar(delim);
328     }
329 
330     /**
331      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
332      *
333      * @param input the string which is to be parsed.
334      * @param delim the field delimiter character.
335      * @param quote the field quoted string character.
336      */
337     public StringTokenizer(final String input, final char delim, final char quote) {
338         this(input, delim);
339         setQuoteChar(quote);
340     }
341 
342     /**
343      * Constructs a tokenizer splitting on the specified delimiter string.
344      *
345      * @param input the string which is to be parsed.
346      * @param delim the field delimiter string.
347      */
348     public StringTokenizer(final String input, final String delim) {
349         this(input);
350         setDelimiterString(delim);
351     }
352 
353     /**
354      * Constructs a tokenizer splitting using the specified delimiter matcher.
355      *
356      * @param input the string which is to be parsed.
357      * @param delim the field delimiter matcher.
358      */
359     public StringTokenizer(final String input, final StringMatcher delim) {
360         this(input);
361         setDelimiterMatcher(delim);
362     }
363 
364     /**
365      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
366      *
367      * @param input the string which is to be parsed.
368      * @param delim the field delimiter matcher.
369      * @param quote the field quoted string matcher.
370      */
371     public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
372         this(input, delim);
373         setQuoteMatcher(quote);
374     }
375 
376     /**
377      * Unsupported ListIterator operation.
378      *
379      * @param obj this parameter ignored.
380      * @throws UnsupportedOperationException always.
381      */
382     @Override
383     public void add(final String obj) {
384         throw new UnsupportedOperationException("add() is unsupported");
385     }
386 
387     /**
388      * Adds a token to a list, paying attention to the parameters we've set.
389      *
390      * @param list the list to add to.
391      * @param tok  the token to add.
392      */
393     private void addToken(final List<String> list, String tok) {
394         if (tok == null || tok.isEmpty()) {
395             if (isIgnoreEmptyTokens()) {
396                 return;
397             }
398             if (isEmptyTokenAsNull()) {
399                 tok = null;
400             }
401         }
402         list.add(tok);
403     }
404 
405     /**
406      * Checks if tokenization has been done, and if not then do it.
407      */
408     private void checkTokenized() {
409         if (tokens == null) {
410             final List<String> split;
411             if (chars == null) {
412                 // still call tokenize as subclass may do some work.
413                 split = tokenize(null, 0, 0);
414             } else {
415                 split = tokenize(chars, 0, chars.length);
416             }
417             tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
418         }
419     }
420 
421     /**
422      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
423      * {@link CloneNotSupportedException} is caught, return {@code null}.
424      *
425      * @return a new instance of this Tokenizer which has been reset.
426      */
427     @Override
428     public Object clone() {
429         try {
430             return cloneReset();
431         } catch (final CloneNotSupportedException ex) {
432             return null;
433         }
434     }
435 
436     /**
437      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
438      *
439      * @return a new instance of this Tokenizer which has been reset.
440      * @throws CloneNotSupportedException if there is a problem cloning.
441      */
442     Object cloneReset() throws CloneNotSupportedException {
443         // this method exists to enable 100% test coverage
444         final StringTokenizer cloned = (StringTokenizer) super.clone();
445         if (cloned.chars != null) {
446             cloned.chars = cloned.chars.clone();
447         }
448         cloned.reset();
449         return cloned;
450     }
451 
452     /**
453      * Gets the String content that the tokenizer is parsing.
454      *
455      * @return The string content being parsed.
456      */
457     public String getContent() {
458         if (chars == null) {
459             return null;
460         }
461         return new String(chars);
462     }
463 
464     /**
465      * Gets the field delimiter matcher.
466      *
467      * @return The delimiter matcher in use.
468      */
469     public StringMatcher getDelimiterMatcher() {
470         return this.delimMatcher;
471     }
472 
473     /**
474      * Gets the ignored character matcher.
475      * <p>
476      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
477      * </p>
478      *
479      * @return The ignored matcher in use.
480      */
481     public StringMatcher getIgnoredMatcher() {
482         return ignoredMatcher;
483     }
484 
485     /**
486      * Gets the quote matcher currently in use.
487      * <p>
488      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
489      * </p>
490      *
491      * @return The quote matcher in use.
492      */
493     public StringMatcher getQuoteMatcher() {
494         return quoteMatcher;
495     }
496 
497     /**
498      * Gets a copy of the full token list as an independent modifiable array.
499      *
500      * @return The tokens as a String array.
501      */
502     public String[] getTokenArray() {
503         checkTokenized();
504         return tokens.clone();
505     }
506 
507     /**
508      * Gets a copy of the full token list as an independent modifiable list.
509      *
510      * @return The tokens as a String list.
511      */
512     public List<String> getTokenList() {
513         checkTokenized();
514         return new ArrayList<>(Arrays.asList(tokens));
515     }
516 
517     /**
518      * Gets the trimmer character matcher.
519      * <p>
520      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
521      * </p>
522      *
523      * @return The trimmer matcher in use.
524      */
525     public StringMatcher getTrimmerMatcher() {
526         return trimmerMatcher;
527     }
528 
529     /**
530      * Tests whether there are any more tokens.
531      *
532      * @return true if there are more tokens.
533      */
534     @Override
535     public boolean hasNext() {
536         checkTokenized();
537         return tokenPos < tokens.length;
538     }
539 
540     /**
541      * Tests whether there are any previous tokens that can be iterated to.
542      *
543      * @return true if there are previous tokens.
544      */
545     @Override
546     public boolean hasPrevious() {
547         checkTokenized();
548         return tokenPos > 0;
549     }
550 
551     /**
552      * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
553      *
554      * @return true if empty tokens are returned as null.
555      */
556     public boolean isEmptyTokenAsNull() {
557         return this.emptyAsNull;
558     }
559 
560     /**
561      * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
562      *
563      * @return true if empty tokens are not returned.
564      */
565     public boolean isIgnoreEmptyTokens() {
566         return ignoreEmptyTokens;
567     }
568 
569     /**
570      * Tests if the characters at the index specified match the quote already matched in readNextToken().
571      *
572      * @param srcChars   the character array being tokenized.
573      * @param pos        the position to check for a quote.
574      * @param len        the length of the character array being tokenized.
575      * @param quoteStart the start position of the matched quote, 0 if no quoting.
576      * @param quoteLen   the length of the matched quote, 0 if no quoting.
577      * @return true if a quote is matched.
578      */
579     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
580         for (int i = 0; i < quoteLen; i++) {
581             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
582                 return false;
583             }
584         }
585         return true;
586     }
587 
588     /**
589      * Gets the next token.
590      *
591      * @return The next String token.
592      * @throws NoSuchElementException if there are no more elements.
593      */
594     @Override
595     public String next() {
596         if (hasNext()) {
597             return tokens[tokenPos++];
598         }
599         throw new NoSuchElementException();
600     }
601 
602     /**
603      * Gets the index of the next token to return.
604      *
605      * @return The next token index.
606      */
607     @Override
608     public int nextIndex() {
609         return tokenPos;
610     }
611 
612     /**
613      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
614      * tokens remain.
615      *
616      * @return The next sequential token, or null when no more tokens are found.
617      */
618     public String nextToken() {
619         if (hasNext()) {
620             return tokens[tokenPos++];
621         }
622         return null;
623     }
624 
625     /**
626      * Gets the token previous to the last returned token.
627      *
628      * @return The previous token.
629      */
630     @Override
631     public String previous() {
632         if (hasPrevious()) {
633             return tokens[--tokenPos];
634         }
635         throw new NoSuchElementException();
636     }
637 
638     /**
639      * Gets the index of the previous token.
640      *
641      * @return The previous token index.
642      */
643     @Override
644     public int previousIndex() {
645         return tokenPos - 1;
646     }
647 
648     /**
649      * Gets the previous token from the String.
650      *
651      * @return The previous sequential token, or null when no more tokens are found.
652      */
653     public String previousToken() {
654         if (hasPrevious()) {
655             return tokens[--tokenPos];
656         }
657         return null;
658     }
659 
    /**
     * Reads character by character through the String to get the next token.
     * <p>
     * Leading ignored/trimmed characters are skipped first. Then either an empty token is recorded (end of input
     * or an immediate delimiter) or the work is handed to {@code readWithQuotes}, in quoting mode when the field
     * starts with a quote.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start     the first character of field.
     * @param len       the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList the list of parsed tokens.
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading characters matched by the ignored or trimmer
        // matcher, unless they are also the field delimiter or the quote
        while (start < len) {
            // the longer of the two matches is the number of characters to drop
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end: only skippable characters remained, so record an empty token
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token: a delimiter right at the field start
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token: start in quoting mode (past the opening quote) if the field begins with a quote
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        // no quote matched: quoteStart and quoteLen of 0 mean "no quoting"
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
704 
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Characters are accumulated in {@code workArea}; {@code trimStart} tracks the length of the accumulated text
     * up to the last character that must be kept, so that trailing trimmable characters are cut off when the
     * token is finally added.
     * </p>
     *
     * @param srcChars   the character array being tokenized.
     * @param start      the first character of field.
     * @param len        the length of the character array being tokenized.
     * @param workArea   a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @param quoteStart the start position of the matched quote, 0 if no quoting.
     * @param quoteLen   the length of the matched quote, 0 if no quoting.
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // a positive quoteLen means the caller already consumed an opening quote
        boolean quoting = quoteLen > 0;
        // length of workArea content to keep; anything appended after it is trailing trim material
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character (inside quotes, or an unquoted character no matcher claimed)
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
796 
797     /**
798      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
799      *
800      * @throws UnsupportedOperationException Always thrown.
801      */
802     @Override
803     public void remove() {
804         throw new UnsupportedOperationException("remove() is unsupported");
805     }
806 
807     /**
808      * Resets this tokenizer, forgetting all parsing and iteration already completed.
809      * <p>
810      * This method allows the same tokenizer to be reused for the same String.
811      * </p>
812      *
813      * @return {@code this} instance.
814      */
815     public StringTokenizer reset() {
816         tokenPos = 0;
817         tokens = null;
818         return this;
819     }
820 
821     /**
822      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
823      *
824      * @param input the new character array to tokenize, not cloned, null sets no text to parse.
825      * @return {@code this} instance.
826      */
827     public StringTokenizer reset(final char[] input) {
828         reset();
829         this.chars = input != null ? input.clone() : null;
830         return this;
831     }
832 
833     /**
834      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
835      *
836      * @param input the new string to tokenize, null sets no text to parse.
837      * @return {@code this} instance.
838      */
839     public StringTokenizer reset(final String input) {
840         reset();
841         this.chars = input != null ? input.toCharArray() : null;
842         return this;
843     }
844 
845     /**
846      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
847      *
848      * @param obj this parameter ignored.
849      * @throws UnsupportedOperationException always.
850      */
851     @Override
852     public void set(final String obj) {
853         throw new UnsupportedOperationException("set() is unsupported");
854     }
855 
856     /**
857      * Sets the field delimiter character.
858      *
859      * @param delim the delimiter character to use.
860      * @return {@code this} instance.
861      */
862     public StringTokenizer setDelimiterChar(final char delim) {
863         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
864     }
865 
866     /**
867      * Sets the field delimiter matcher.
868      * <p>
869      * The delimiter is used to separate one token from another.
870      * </p>
871      *
872      * @param delim the delimiter matcher to use.
873      * @return {@code this} instance.
874      */
875     public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
876         this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
877         return this;
878     }
879 
880     /**
881      * Sets the field delimiter string.
882      *
883      * @param delim the delimiter string to use.
884      * @return {@code this} instance.
885      */
886     public StringTokenizer setDelimiterString(final String delim) {
887         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
888     }
889 
890     /**
891      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
892      *
893      * @param emptyAsNull whether empty tokens are returned as null.
894      * @return {@code this} instance.
895      */
896     public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
897         this.emptyAsNull = emptyAsNull;
898         return this;
899     }
900 
901     /**
902      * Sets the character to ignore.
903      * <p>
904      * This character is ignored when parsing the String, unless it is within a quoted region.
905      * </p>
906      *
907      * @param ignored the ignored character to use.
908      * @return {@code this} instance.
909      */
910     public StringTokenizer setIgnoredChar(final char ignored) {
911         return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
912     }
913 
914     /**
915      * Sets the matcher for characters to ignore.
916      * <p>
917      * These characters are ignored when parsing the String, unless they are within a quoted region.
918      * </p>
919      *
920      * @param ignored the ignored matcher to use, null ignored.
921      * @return {@code this} instance.
922      */
923     public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
924         if (ignored != null) {
925             this.ignoredMatcher = ignored;
926         }
927         return this;
928     }
929 
930     /**
931      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
932      *
933      * @param ignoreEmptyTokens whether empty tokens are not returned.
934      * @return {@code this} instance.
935      */
936     public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
937         this.ignoreEmptyTokens = ignoreEmptyTokens;
938         return this;
939     }
940 
941     /**
942      * Sets the quote character to use.
943      * <p>
944      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
945      * </p>
946      *
947      * @param quote the quote character to use.
948      * @return {@code this} instance.
949      */
950     public StringTokenizer setQuoteChar(final char quote) {
951         return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
952     }
953 
954     /**
955      * Sets the quote matcher to use.
956      * <p>
957      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
958      * </p>
959      *
960      * @param quote the quote matcher to use, null ignored.
961      * @return {@code this} instance.
962      */
963     public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
964         if (quote != null) {
965             this.quoteMatcher = quote;
966         }
967         return this;
968     }
969 
970     /**
971      * Sets the matcher for characters to trim.
972      * <p>
973      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
974      *
975      * @param trimmer the trimmer matcher to use, null ignored.
976      * @return {@code this} instance.
977      */
978     public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
979         if (trimmer != null) {
980             this.trimmerMatcher = trimmer;
981         }
982         return this;
983     }
984 
985     /**
986      * Gets the number of tokens found in the String.
987      *
988      * @return The number of matched tokens.
989      */
990     public int size() {
991         checkTokenized();
992         return tokens.length;
993     }
994 
995     /**
996      * Internal method to performs the tokenization.
997      * <p>
998      * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
999      * </p>
1000      * <p>
1001      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1002      * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1003      * </p>
1004      * <p>
1005      * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1006      * values, or even an entirely different array.
1007      * </p>
1008      *
1009      * @param srcChars the character array being tokenized, may be null.
1010      * @param offset   the start position within the character array, must be valid.
1011      * @param count    the number of characters to tokenize, must be valid.
1012      * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1013      */
1014     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1015         if (srcChars == null || count == 0) {
1016             return Collections.emptyList();
1017         }
1018         final TextStringBuilder buf = new TextStringBuilder();
1019         final List<String> tokenList = new ArrayList<>();
1020         int pos = offset;
1021         // loop around the entire buffer
1022         while (pos >= 0 && pos < count) {
1023             // find next token
1024             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1025             // handle case where end of string is a delimiter
1026             if (pos >= count) {
1027                 addToken(tokenList, StringUtils.EMPTY);
1028             }
1029         }
1030         return tokenList;
1031     }
1032 
1033     /**
1034      * Gets the String content that the tokenizer is parsing.
1035      *
1036      * @return The string content being parsed.
1037      */
1038     @Override
1039     public String toString() {
1040         if (tokens == null) {
1041             return "StringTokenizer[not tokenized yet]";
1042         }
1043         return "StringTokenizer" + getTokenList();
1044     }
1045 }