View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  
26  import org.apache.commons.lang3.ArrayUtils;
27  import org.apache.commons.lang3.StringUtils;
28  import org.apache.commons.text.matcher.StringMatcher;
29  import org.apache.commons.text.matcher.StringMatcherFactory;
30  
31  /**
32   * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
33   * <p>
34   * This class can split a String into many smaller strings. It aims to do a similar job to
35   * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
36   * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
39   * <em>delimiter</em>. One or more delimiter characters must be specified.
40   * <p>
41   * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
42   * escaped within a quoted section by duplicating itself.
43   * <p>
44   * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
45   * specifies these characters. One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
48   * these characters to be removed. One usage might be to remove new line characters.
49   * <p>
50   * Empty tokens may be removed or returned as null.
51   *
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   *
58   * <table>
59   * <caption>StringTokenizer properties and options</caption>
60   * <tr>
61   * <th>Property</th>
62   * <th>Type</th>
63   * <th>Default</th>
64   * </tr>
65   * <tr>
66   * <td>delim</td>
67   * <td>CharSetMatcher</td>
68   * <td>{ \t\n\r\f}</td>
69   * </tr>
70   * <tr>
71   * <td>quote</td>
72   * <td>NoneMatcher</td>
73   * <td>{}</td>
74   * </tr>
75   * <tr>
76   * <td>ignore</td>
77   * <td>NoneMatcher</td>
78   * <td>{}</td>
79   * </tr>
80   * <tr>
81   * <td>emptyTokenAsNull</td>
82   * <td>boolean</td>
83   * <td>false</td>
84   * </tr>
85   * <tr>
86   * <td>ignoreEmptyTokens</td>
87   * <td>boolean</td>
88   * <td>true</td>
89   * </tr>
90   * </table>
91   *
92   * @since 1.3
93   */
94  public class StringTokenizer implements ListIterator<String>, Cloneable {
95  
96      /** Comma separated values tokenizer internal variable. */
97      // @formatter:off
98      private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
99              .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
100             .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
101             .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
102             .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
103             .setEmptyTokenAsNull(false)
104             .setIgnoreEmptyTokens(false);
105     // @formatter:on
106 
107     /** Tab separated values tokenizer internal variable. */
108     // @formatter:off
109     private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
110             .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
111             .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
112             .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
113             .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
114             .setEmptyTokenAsNull(false)
115             .setIgnoreEmptyTokens(false);
116     // @formatter:on
117 
118     /**
119      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
120      *
121      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122      */
123     private static StringTokenizer getCSVClone() {
124         return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
125     }
126 
127     /**
128      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
129      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
130      * setTrimmer method).
131      * <p>
132      * You must call a "reset" method to set the string which you want to parse.
133      * </p>
134      *
135      * @return a new tokenizer instance which parses Comma Separated Value strings
136      */
137     public static StringTokenizer getCSVInstance() {
138         return getCSVClone();
139     }
140 
141     /**
142      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
143      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
144      * setTrimmer method).
145      *
146      * @param input
147      *            the text to parse
148      * @return a new tokenizer instance which parses Comma Separated Value strings
149      */
150     public static StringTokenizer getCSVInstance(final char[] input) {
151         return getCSVClone().reset(input);
152     }
153 
154     /**
155      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
156      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
157      * setTrimmer method).
158      *
159      * @param input
160      *            the text to parse
161      * @return a new tokenizer instance which parses Comma Separated Value strings
162      */
163     public static StringTokenizer getCSVInstance(final String input) {
164         return getCSVClone().reset(input);
165     }
166 
167     /**
168      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
169      *
170      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171      */
172     private static StringTokenizer getTSVClone() {
173         return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
174     }
175 
176     /**
177      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
178      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
179      * <p>
180      * You must call a "reset" method to set the string which you want to parse.
181      * </p>
182      *
183      * @return a new tokenizer instance which parses Tab Separated Value strings.
184      */
185     public static StringTokenizer getTSVInstance() {
186         return getTSVClone();
187     }
188 
189     /**
190      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
191      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
192      *
193      * @param input
194      *            the string to parse
195      * @return a new tokenizer instance which parses Tab Separated Value strings.
196      */
197     public static StringTokenizer getTSVInstance(final char[] input) {
198         return getTSVClone().reset(input);
199     }
200 
201     /**
202      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
203      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
204      *
205      * @param input
206      *            the string to parse
207      * @return a new tokenizer instance which parses Tab Separated Value strings.
208      */
209     public static StringTokenizer getTSVInstance(final String input) {
210         return getTSVClone().reset(input);
211     }
212 
213     /** The text to work on. */
214     private char[] chars;
215 
216     /** The parsed tokens. */
217     private String[] tokens;
218 
219     /** The current iteration position. */
220     private int tokenPos;
221 
222     /** The delimiter matcher. */
223     private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
224 
225     /** The quote matcher. */
226     private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
227 
228     /** The ignored matcher. */
229     private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
230 
231     /** The trimmer matcher. */
232     private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
233 
234     /** Whether to return empty tokens as null. */
235     private boolean emptyAsNull;
236 
237     /** Whether to ignore empty tokens. */
238     private boolean ignoreEmptyTokens = true;
239 
240     /**
241      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
242      * tokenize.
243      * <p>
244      * This constructor is normally used with {@link #reset(String)}.
245      * </p>
246      */
247     public StringTokenizer() {
248         this.chars = null;
249     }
250 
251     /**
252      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
253      *
254      * @param input
255      *            the string which is to be parsed, not cloned
256      */
257     public StringTokenizer(final char[] input) {
258         this.chars = input != null ? input.clone() : null;
259     }
260 
261     /**
262      * Constructs a tokenizer splitting on the specified character.
263      *
264      * @param input
265      *            the string which is to be parsed, not cloned
266      * @param delim
267      *            the field delimiter character
268      */
269     public StringTokenizer(final char[] input, final char delim) {
270         this(input);
271         setDelimiterChar(delim);
272     }
273 
274     /**
275      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
276      * quote character.
277      *
278      * @param input
279      *            the string which is to be parsed, not cloned
280      * @param delim
281      *            the field delimiter character
282      * @param quote
283      *            the field quoted string character
284      */
285     public StringTokenizer(final char[] input, final char delim, final char quote) {
286         this(input, delim);
287         setQuoteChar(quote);
288     }
289 
290     /**
291      * Constructs a tokenizer splitting on the specified string.
292      *
293      * @param input
294      *            the string which is to be parsed, not cloned
295      * @param delim
296      *            the field delimiter string
297      */
298     public StringTokenizer(final char[] input, final String delim) {
299         this(input);
300         setDelimiterString(delim);
301     }
302 
303     /**
304      * Constructs a tokenizer splitting using the specified delimiter matcher.
305      *
306      * @param input
307      *            the string which is to be parsed, not cloned
308      * @param delim
309      *            the field delimiter matcher
310      */
311     public StringTokenizer(final char[] input, final StringMatcher delim) {
312         this(input);
313         setDelimiterMatcher(delim);
314     }
315 
316     /**
317      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
318      * quote matcher.
319      *
320      * @param input
321      *            the string which is to be parsed, not cloned
322      * @param delim
323      *            the field delimiter character
324      * @param quote
325      *            the field quoted string character
326      */
327     public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
328         this(input, delim);
329         setQuoteMatcher(quote);
330     }
331 
332     /**
333      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
334      *
335      * @param input
336      *            the string which is to be parsed
337      */
338     public StringTokenizer(final String input) {
339         this.chars = input != null ? input.toCharArray() : null;
340     }
341 
342     /**
343      * Constructs a tokenizer splitting on the specified delimiter character.
344      *
345      * @param input
346      *            the string which is to be parsed
347      * @param delim
348      *            the field delimiter character
349      */
350     public StringTokenizer(final String input, final char delim) {
351         this(input);
352         setDelimiterChar(delim);
353     }
354 
355     /**
356      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
357      * quote character.
358      *
359      * @param input
360      *            the string which is to be parsed
361      * @param delim
362      *            the field delimiter character
363      * @param quote
364      *            the field quoted string character
365      */
366     public StringTokenizer(final String input, final char delim, final char quote) {
367         this(input, delim);
368         setQuoteChar(quote);
369     }
370 
371     /**
372      * Constructs a tokenizer splitting on the specified delimiter string.
373      *
374      * @param input
375      *            the string which is to be parsed
376      * @param delim
377      *            the field delimiter string
378      */
379     public StringTokenizer(final String input, final String delim) {
380         this(input);
381         setDelimiterString(delim);
382     }
383 
384     /**
385      * Constructs a tokenizer splitting using the specified delimiter matcher.
386      *
387      * @param input
388      *            the string which is to be parsed
389      * @param delim
390      *            the field delimiter matcher
391      */
392     public StringTokenizer(final String input, final StringMatcher delim) {
393         this(input);
394         setDelimiterMatcher(delim);
395     }
396 
397     /**
398      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
399      * quote matcher.
400      *
401      * @param input
402      *            the string which is to be parsed
403      * @param delim
404      *            the field delimiter matcher
405      * @param quote
406      *            the field quoted string matcher
407      */
408     public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
409         this(input, delim);
410         setQuoteMatcher(quote);
411     }
412 
413     /**
414      * Unsupported ListIterator operation.
415      *
416      * @param obj
417      *            this parameter ignored.
418      * @throws UnsupportedOperationException
419      *             always
420      */
421     @Override
422     public void add(final String obj) {
423         throw new UnsupportedOperationException("add() is unsupported");
424     }
425 
426     /**
427      * Adds a token to a list, paying attention to the parameters we've set.
428      *
429      * @param list
430      *            the list to add to
431      * @param tok
432      *            the token to add
433      */
434     private void addToken(final List<String> list, String tok) {
435         if (tok == null || tok.isEmpty()) {
436             if (isIgnoreEmptyTokens()) {
437                 return;
438             }
439             if (isEmptyTokenAsNull()) {
440                 tok = null;
441             }
442         }
443         list.add(tok);
444     }
445 
446     /**
447      * Checks if tokenization has been done, and if not then do it.
448      */
449     private void checkTokenized() {
450         if (tokens == null) {
451             final List<String> split;
452             if (chars == null) {
453                 // still call tokenize as subclass may do some work
454                 split = tokenize(null, 0, 0);
455             } else {
456                 split = tokenize(chars, 0, chars.length);
457             }
458             tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
459         }
460     }
461 
462     /**
463      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
464      * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
465      *
466      * @return a new instance of this Tokenizer which has been reset.
467      */
468     @Override
469     public Object clone() {
470         try {
471             return cloneReset();
472         } catch (final CloneNotSupportedException ex) {
473             return null;
474         }
475     }
476 
477     /**
478      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
479      * list.
480      *
481      * @return a new instance of this Tokenizer which has been reset.
482      * @throws CloneNotSupportedException
483      *             if there is a problem cloning
484      */
485     Object cloneReset() throws CloneNotSupportedException {
486         // this method exists to enable 100% test coverage
487         final StringTokenizer cloned = (StringTokenizer) super.clone();
488         if (cloned.chars != null) {
489             cloned.chars = cloned.chars.clone();
490         }
491         cloned.reset();
492         return cloned;
493     }
494 
495     /**
496      * Gets the String content that the tokenizer is parsing.
497      *
498      * @return The string content being parsed
499      */
500     public String getContent() {
501         if (chars == null) {
502             return null;
503         }
504         return new String(chars);
505     }
506 
507     /**
508      * Gets the field delimiter matcher.
509      *
510      * @return The delimiter matcher in use
511      */
512     public StringMatcher getDelimiterMatcher() {
513         return this.delimMatcher;
514     }
515 
516     /**
517      * Gets the ignored character matcher.
518      * <p>
519      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
520      * is not to ignore anything.
521      * </p>
522      *
523      * @return The ignored matcher in use
524      */
525     public StringMatcher getIgnoredMatcher() {
526         return ignoredMatcher;
527     }
528 
529     /**
530      * Gets the quote matcher currently in use.
531      * <p>
532      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
533      * default value is '"' (double quote).
534      * </p>
535      *
536      * @return The quote matcher in use
537      */
538     public StringMatcher getQuoteMatcher() {
539         return quoteMatcher;
540     }
541 
542     /**
543      * Gets a copy of the full token list as an independent modifiable array.
544      *
545      * @return The tokens as a String array
546      */
547     public String[] getTokenArray() {
548         checkTokenized();
549         return tokens.clone();
550     }
551 
552     /**
553      * Gets a copy of the full token list as an independent modifiable list.
554      *
555      * @return The tokens as a String list
556      */
557     public List<String> getTokenList() {
558         checkTokenized();
559         return new ArrayList<>(Arrays.asList(tokens));
560     }
561 
562     /**
563      * Gets the trimmer character matcher.
564      * <p>
565      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
566      * value is not to trim anything.
567      * </p>
568      *
569      * @return The trimmer matcher in use
570      */
571     public StringMatcher getTrimmerMatcher() {
572         return trimmerMatcher;
573     }
574 
575     /**
576      * Tests whether there are any more tokens.
577      *
578      * @return true if there are more tokens
579      */
580     @Override
581     public boolean hasNext() {
582         checkTokenized();
583         return tokenPos < tokens.length;
584     }
585 
586     /**
587      * Tests whether there are any previous tokens that can be iterated to.
588      *
589      * @return true if there are previous tokens
590      */
591     @Override
592     public boolean hasPrevious() {
593         checkTokenized();
594         return tokenPos > 0;
595     }
596 
597     /**
598      * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
599      *
600      * @return true if empty tokens are returned as null
601      */
602     public boolean isEmptyTokenAsNull() {
603         return this.emptyAsNull;
604     }
605 
606     /**
607      * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
608      *
609      * @return true if empty tokens are not returned
610      */
611     public boolean isIgnoreEmptyTokens() {
612         return ignoreEmptyTokens;
613     }
614 
615     /**
616      * Tests if the characters at the index specified match the quote already matched in readNextToken().
617      *
618      * @param srcChars
619      *            the character array being tokenized
620      * @param pos
621      *            the position to check for a quote
622      * @param len
623      *            the length of the character array being tokenized
624      * @param quoteStart
625      *            the start position of the matched quote, 0 if no quoting
626      * @param quoteLen
627      *            the length of the matched quote, 0 if no quoting
628      * @return true if a quote is matched
629      */
630     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
631             final int quoteLen) {
632         for (int i = 0; i < quoteLen; i++) {
633             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
634                 return false;
635             }
636         }
637         return true;
638     }
639 
640     /**
641      * Gets the next token.
642      *
643      * @return The next String token
644      * @throws NoSuchElementException
645      *             if there are no more elements
646      */
647     @Override
648     public String next() {
649         if (hasNext()) {
650             return tokens[tokenPos++];
651         }
652         throw new NoSuchElementException();
653     }
654 
655     /**
656      * Gets the index of the next token to return.
657      *
658      * @return The next token index
659      */
660     @Override
661     public int nextIndex() {
662         return tokenPos;
663     }
664 
665     /**
666      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
667      * {@link NoSuchElementException} when no tokens remain.
668      *
669      * @return The next sequential token, or null when no more tokens are found
670      */
671     public String nextToken() {
672         if (hasNext()) {
673             return tokens[tokenPos++];
674         }
675         return null;
676     }
677 
678     /**
679      * Gets the token previous to the last returned token.
680      *
681      * @return The previous token
682      */
683     @Override
684     public String previous() {
685         if (hasPrevious()) {
686             return tokens[--tokenPos];
687         }
688         throw new NoSuchElementException();
689     }
690 
691     /**
692      * Gets the index of the previous token.
693      *
694      * @return The previous token index
695      */
696     @Override
697     public int previousIndex() {
698         return tokenPos - 1;
699     }
700 
701     /**
702      * Gets the previous token from the String.
703      *
704      * @return The previous sequential token, or null when no more tokens are found
705      */
706     public String previousToken() {
707         if (hasPrevious()) {
708             return tokens[--tokenPos];
709         }
710         return null;
711     }
712 
713     /**
714      * Reads character by character through the String to get the next token.
715      *
716      * @param srcChars
717      *            the character array being tokenized
718      * @param start
719      *            the first character of field
720      * @param len
721      *            the length of the character array being tokenized
722      * @param workArea
723      *            a temporary work area
724      * @param tokenList
725      *            the list of parsed tokens
726      * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
727      *         string found
728      */
729     private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
730             final List<String> tokenList) {
731         // skip all leading whitespace, unless it is the
732         // field delimiter or the quote character
733         while (start < len) {
734             final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
735                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
736             if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
737                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
738                 break;
739             }
740             start += removeLen;
741         }
742 
743         // handle reaching end
744         if (start >= len) {
745             addToken(tokenList, StringUtils.EMPTY);
746             return -1;
747         }
748 
749         // handle empty token
750         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
751         if (delimLen > 0) {
752             addToken(tokenList, StringUtils.EMPTY);
753             return start + delimLen;
754         }
755 
756         // handle found token
757         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
758         if (quoteLen > 0) {
759             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
760         }
761         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
762     }
763 
764     /**
765      * Reads a possibly quoted string token.
766      *
767      * @param srcChars
768      *            the character array being tokenized
769      * @param start
770      *            the first character of field
771      * @param len
772      *            the length of the character array being tokenized
773      * @param workArea
774      *            a temporary work area
775      * @param tokenList
776      *            the list of parsed tokens
777      * @param quoteStart
778      *            the start position of the matched quote, 0 if no quoting
779      * @param quoteLen
780      *            the length of the matched quote, 0 if no quoting
781      * @return The starting position of the next field (the character immediately after the delimiter, or if end of
782      *         string found, then the length of string
783      */
784     private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
785             final List<String> tokenList, final int quoteStart, final int quoteLen) {
786         // Loop until we've found the end of the quoted
787         // string or the end of the input
788         workArea.clear();
789         int pos = start;
790         boolean quoting = quoteLen > 0;
791         int trimStart = 0;
792 
793         while (pos < len) {
794             // quoting mode can occur several times throughout a string
795             // we must switch between quoting and non-quoting until we
796             // encounter a non-quoted delimiter, or end of string
797             if (quoting) {
798                 // In quoting mode
799 
800                 // If we've found a quote character, see if it's
801                 // followed by a second quote. If so, then we need
802                 // to actually put the quote character into the token
803                 // rather than end the token.
804                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
805                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
806                         // matched pair of quotes, thus an escaped quote
807                         workArea.append(srcChars, pos, quoteLen);
808                         pos += quoteLen * 2;
809                         trimStart = workArea.size();
810                         continue;
811                     }
812 
813                     // end of quoting
814                     quoting = false;
815                     pos += quoteLen;
816                     continue;
817                 }
818 
819             } else {
820                 // Not in quoting mode
821 
822                 // check for delimiter, and thus end of token
823                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
824                 if (delimLen > 0) {
825                     // return condition when end of token found
826                     addToken(tokenList, workArea.substring(0, trimStart));
827                     return pos + delimLen;
828                 }
829 
830                 // check for quote, and thus back into quoting mode
831                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
832                     quoting = true;
833                     pos += quoteLen;
834                     continue;
835                 }
836 
837                 // check for ignored (outside quotes), and ignore
838                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
839                 if (ignoredLen > 0) {
840                     pos += ignoredLen;
841                     continue;
842                 }
843 
844                 // check for trimmed character
845                 // don't yet know if its at the end, so copy to workArea
846                 // use trimStart to keep track of trim at the end
847                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
848                 if (trimmedLen > 0) {
849                     workArea.append(srcChars, pos, trimmedLen);
850                     pos += trimmedLen;
851                     continue;
852                 }
853             }
854             // copy regular character from inside quotes
855             workArea.append(srcChars[pos++]);
856             trimStart = workArea.size();
857         }
858 
859         // return condition when end of string found
860         addToken(tokenList, workArea.substring(0, trimStart));
861         return -1;
862     }
863 
864     /**
865      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
866      *
867      * @throws UnsupportedOperationException
868      *             always
869      */
870     @Override
871     public void remove() {
872         throw new UnsupportedOperationException("remove() is unsupported");
873     }
874 
875     /**
876      * Resets this tokenizer, forgetting all parsing and iteration already completed.
877      * <p>
878      * This method allows the same tokenizer to be reused for the same String.
879      * </p>
880      *
881      * @return this, to enable chaining
882      */
883     public StringTokenizer reset() {
884         tokenPos = 0;
885         tokens = null;
886         return this;
887     }
888 
889     /**
890      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
891      * same settings on multiple input lines.
892      *
893      * @param input
894      *            the new character array to tokenize, not cloned, null sets no text to parse
895      * @return this, to enable chaining
896      */
897     public StringTokenizer reset(final char[] input) {
898         reset();
899         this.chars = input != null ? input.clone() : null;
900         return this;
901     }
902 
903     /**
904      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
905      * same settings on multiple input lines.
906      *
907      * @param input
908      *            the new string to tokenize, null sets no text to parse
909      * @return this, to enable chaining
910      */
911     public StringTokenizer reset(final String input) {
912         reset();
913         this.chars = input != null ? input.toCharArray() : null;
914         return this;
915     }
916 
917     /**
918      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
919      *
920      * @param obj
921      *            this parameter ignored.
922      * @throws UnsupportedOperationException
923      *             always
924      */
925     @Override
926     public void set(final String obj) {
927         throw new UnsupportedOperationException("set() is unsupported");
928     }
929 
930     /**
931      * Sets the field delimiter character.
932      *
933      * @param delim
934      *            the delimiter character to use
935      * @return this, to enable chaining
936      */
937     public StringTokenizer setDelimiterChar(final char delim) {
938         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
939     }
940 
941     /**
942      * Sets the field delimiter matcher.
943      * <p>
944      * The delimiter is used to separate one token from another.
945      * </p>
946      *
947      * @param delim
948      *            the delimiter matcher to use
949      * @return this, to enable chaining
950      */
951     public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
952         this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
953         return this;
954     }
955 
956     /**
957      * Sets the field delimiter string.
958      *
959      * @param delim
960      *            the delimiter string to use
961      * @return this, to enable chaining
962      */
963     public StringTokenizer setDelimiterString(final String delim) {
964         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
965     }
966 
967     /**
968      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
969      *
970      * @param emptyAsNull
971      *            whether empty tokens are returned as null
972      * @return this, to enable chaining
973      */
974     public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
975         this.emptyAsNull = emptyAsNull;
976         return this;
977     }
978 
979     /**
980      * Sets the character to ignore.
981      * <p>
982      * This character is ignored when parsing the String, unless it is within a quoted region.
983      * </p>
984      *
985      * @param ignored
986      *            the ignored character to use
987      * @return this, to enable chaining
988      */
989     public StringTokenizer setIgnoredChar(final char ignored) {
990         return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
991     }
992 
993     /**
994      * Sets the matcher for characters to ignore.
995      * <p>
996      * These characters are ignored when parsing the String, unless they are within a quoted region.
997      * </p>
998      *
999      * @param ignored
1000      *            the ignored matcher to use, null ignored
1001      * @return this, to enable chaining
1002      */
1003     public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1004         if (ignored != null) {
1005             this.ignoredMatcher = ignored;
1006         }
1007         return this;
1008     }
1009 
1010     /**
1011      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1012      *
1013      * @param ignoreEmptyTokens
1014      *            whether empty tokens are not returned
1015      * @return this, to enable chaining
1016      */
1017     public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1018         this.ignoreEmptyTokens = ignoreEmptyTokens;
1019         return this;
1020     }
1021 
1022     /**
1023      * Sets the quote character to use.
1024      * <p>
1025      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1026      * </p>
1027      *
1028      * @param quote
1029      *            the quote character to use
1030      * @return this, to enable chaining
1031      */
1032     public StringTokenizer setQuoteChar(final char quote) {
1033         return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
1034     }
1035 
1036     /**
1037      * Sets the quote matcher to use.
1038      * <p>
1039      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1040      * </p>
1041      *
1042      * @param quote
1043      *            the quote matcher to use, null ignored
1044      * @return this, to enable chaining
1045      */
1046     public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
1047         if (quote != null) {
1048             this.quoteMatcher = quote;
1049         }
1050         return this;
1051     }
1052 
1053     /**
1054      * Sets the matcher for characters to trim.
1055      * <p>
1056      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1057      *
1058      * @param trimmer
1059      *            the trimmer matcher to use, null ignored
1060      * @return this, to enable chaining
1061      */
1062     public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1063         if (trimmer != null) {
1064             this.trimmerMatcher = trimmer;
1065         }
1066         return this;
1067     }
1068 
1069     /**
1070      * Gets the number of tokens found in the String.
1071      *
1072      * @return The number of matched tokens
1073      */
1074     public int size() {
1075         checkTokenized();
1076         return tokens.length;
1077     }
1078 
1079     /**
1080      * Internal method to performs the tokenization.
1081      * <p>
1082      * Most users of this class do not need to call this method. This method will be called automatically by other
1083      * (public) methods when required.
1084      * </p>
1085      * <p>
1086      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
1087      * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
1088      * strings. It is also be possible to filter the results.
1089      * </p>
1090      * <p>
1091      * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this
1092      * method, however a subclass may pass other values, or even an entirely different array.
1093      * </p>
1094      *
1095      * @param srcChars
1096      *            the character array being tokenized, may be null
1097      * @param offset
1098      *            the start position within the character array, must be valid
1099      * @param count
1100      *            the number of characters to tokenize, must be valid
1101      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1102      */
1103     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1104         if (srcChars == null || count == 0) {
1105             return Collections.emptyList();
1106         }
1107         final TextStringBuilder buf = new TextStringBuilder();
1108         final List<String> tokenList = new ArrayList<>();
1109         int pos = offset;
1110 
1111         // loop around the entire buffer
1112         while (pos >= 0 && pos < count) {
1113             // find next token
1114             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1115 
1116             // handle case where end of string is a delimiter
1117             if (pos >= count) {
1118                 addToken(tokenList, StringUtils.EMPTY);
1119             }
1120         }
1121         return tokenList;
1122     }
1123 
1124     /**
1125      * Gets the String content that the tokenizer is parsing.
1126      *
1127      * @return The string content being parsed
1128      */
1129     @Override
1130     public String toString() {
1131         if (tokens == null) {
1132             return "StringTokenizer[not tokenized yet]";
1133         }
1134         return "StringTokenizer" + getTokenList();
1135     }
1136 
1137 }