1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  
26  import org.apache.commons.lang3.ArrayUtils;
27  import org.apache.commons.lang3.StringUtils;
28  import org.apache.commons.text.matcher.StringMatcher;
29  import org.apache.commons.text.matcher.StringMatcherFactory;
30  
31  /**
32   * Tokenizes a string based on delimiters (separators), supporting quoting and the notion of ignored characters.
33   * <p>
34   * This class can split a String into many smaller strings. It aims to do a similar job to
35   * {@link java.util.StringTokenizer StringTokenizer}; however, it offers much more control and flexibility,
36   * including implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
39   * <i>delimiter</i>. One or more delimiter characters must be specified.
40   * <p>
41   * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
42   * escaped within a quoted section by duplicating itself.
43   * <p>
44   * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
45   * specifies these characters. One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
48   * these characters to be removed. One usage might be to remove new line characters.
49   * <p>
50   * Empty tokens may be removed or returned as null.
51   *
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   *
58   * <table>
59   * <caption>StringTokenizer properties and options</caption>
60   * <tr>
61   * <th>Property</th>
62   * <th>Type</th>
63   * <th>Default</th>
64   * </tr>
65   * <tr>
66   * <td>delim</td>
67   * <td>CharSetMatcher</td>
68   * <td>{ \t\n\r\f}</td>
69   * </tr>
70   * <tr>
71   * <td>quote</td>
72   * <td>NoneMatcher</td>
73   * <td>{}</td>
74   * </tr>
75   * <tr>
76   * <td>ignore</td>
77   * <td>NoneMatcher</td>
78   * <td>{}</td>
79   * </tr>
80   * <tr>
81   * <td>emptyTokenAsNull</td>
82   * <td>boolean</td>
83   * <td>false</td>
84   * </tr>
85   * <tr>
86   * <td>ignoreEmptyTokens</td>
87   * <td>boolean</td>
88   * <td>true</td>
89   * </tr>
90   * </table>
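 *
 * <p>
 * As a rough usage sketch (the input string and variable names are illustrative only), the default
 * whitespace-splitting behaviour can be used like this:
 * </p>
 * <pre>{@code
 * StringTokenizer tokenizer = new StringTokenizer("one two\tthree"); // illustrative input
 * while (tokenizer.hasNext()) {
 *     String token = tokenizer.next(); // "one", then "two", then "three"
 * }
 * }</pre>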
91   *
92   * @since 1.3
93   */
94  public class StringTokenizer implements ListIterator<String>, Cloneable {
95  
96      /** Comma separated values tokenizer internal variable. */
97      private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
98  
99      /** Tab separated values tokenizer internal variable. */
100     private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
101 
102     static {
103         CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
104         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
105         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
106         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
107         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
108         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
109         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
110 
111         TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
112         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
113         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
114         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
115         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
116         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
117         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
118     }
119 
120     /**
121      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122      *
123      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
124      */
125     private static StringTokenizer getCSVClone() {
126         return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
127     }
128 
129     /**
130     * Gets a new tokenizer instance which parses Comma Separated Value strings.
131     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
132     * setTrimmerMatcher method).
133      * <p>
134      * You must call a "reset" method to set the string which you want to parse.
135      * </p>
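     * <p>
     * A minimal sketch of typical use (the input line is illustrative only):
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = StringTokenizer.getCSVInstance();
     * tokenizer.reset("a, \"b,b\", c"); // illustrative input
     * List<String> tokens = tokenizer.getTokenList(); // ["a", "b,b", "c"]
     * }</pre>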
136      *
137      * @return a new tokenizer instance which parses Comma Separated Value strings
138      */
139     public static StringTokenizer getCSVInstance() {
140         return getCSVClone();
141     }
142 
143     /**
144     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
145     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
146     * setTrimmerMatcher method).
147      *
148      * @param input
149      *            the text to parse
150      * @return a new tokenizer instance which parses Comma Separated Value strings
151      */
152     public static StringTokenizer getCSVInstance(final char[] input) {
153         return getCSVClone().reset(input);
154     }
155 
156     /**
157     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
158     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
159     * setTrimmerMatcher method).
160      *
161      * @param input
162      *            the text to parse
163      * @return a new tokenizer instance which parses Comma Separated Value strings
164      */
165     public static StringTokenizer getCSVInstance(final String input) {
166         return getCSVClone().reset(input);
167     }
168 
169     /**
170      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171      *
172      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173      */
174     private static StringTokenizer getTSVClone() {
175         return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176     }
177 
178     /**
179     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
180     * trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
181      * <p>
182      * You must call a "reset" method to set the string which you want to parse.
183      * </p>
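     * <p>
     * A minimal sketch (the input line is illustrative only):
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = StringTokenizer.getTSVInstance();
     * tokenizer.reset("a\tb\tc"); // illustrative input
     * List<String> tokens = tokenizer.getTokenList(); // ["a", "b", "c"]
     * }</pre>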
184      *
185      * @return a new tokenizer instance which parses Tab Separated Value strings.
186      */
187     public static StringTokenizer getTSVInstance() {
188         return getTSVClone();
189     }
190 
191     /**
192     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
193     * trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
194      *
195      * @param input
196      *            the string to parse
197      * @return a new tokenizer instance which parses Tab Separated Value strings.
198      */
199     public static StringTokenizer getTSVInstance(final char[] input) {
200         return getTSVClone().reset(input);
201     }
202 
203     /**
204     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
205     * trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
206      *
207      * @param input
208      *            the string to parse
209      * @return a new tokenizer instance which parses Tab Separated Value strings.
210      */
211     public static StringTokenizer getTSVInstance(final String input) {
212         return getTSVClone().reset(input);
213     }
214 
215     /** The text to work on. */
216     private char[] chars;
217 
218     /** The parsed tokens. */
219     private String[] tokens;
220 
221     /** The current iteration position. */
222     private int tokenPos;
223 
224     /** The delimiter matcher. */
225     private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
226 
227     /** The quote matcher. */
228     private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
229 
230     /** The ignored matcher. */
231     private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
232 
233     /** The trimmer matcher. */
234     private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
235 
236     /** Whether to return empty tokens as null. */
237     private boolean emptyAsNull;
238 
239     /** Whether to ignore empty tokens. */
240     private boolean ignoreEmptyTokens = true;
241 
242     /**
243     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer,
244     * but with no text to tokenize.
245      * <p>
246      * This constructor is normally used with {@link #reset(String)}.
247      * </p>
248      */
249     public StringTokenizer() {
250         this.chars = null;
251     }
252 
253     /**
254     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
255      *
256      * @param input
257  *            the string which is to be parsed; this array is cloned
258      */
259     public StringTokenizer(final char[] input) {
260         this.chars = input != null ? input.clone() : null;
261     }
262 
263     /**
264      * Constructs a tokenizer splitting on the specified character.
265      *
266      * @param input
267  *            the string which is to be parsed; this array is cloned
268      * @param delim
269      *            the field delimiter character
270      */
271     public StringTokenizer(final char[] input, final char delim) {
272         this(input);
273         setDelimiterChar(delim);
274     }
275 
276     /**
277      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
278      * quote character.
279      *
280      * @param input
281  *            the string which is to be parsed; this array is cloned
282      * @param delim
283      *            the field delimiter character
284      * @param quote
285      *            the field quoted string character
286      */
287     public StringTokenizer(final char[] input, final char delim, final char quote) {
288         this(input, delim);
289         setQuoteChar(quote);
290     }
291 
292     /**
293      * Constructs a tokenizer splitting on the specified string.
294      *
295      * @param input
296  *            the string which is to be parsed; this array is cloned
297      * @param delim
298      *            the field delimiter string
299      */
300     public StringTokenizer(final char[] input, final String delim) {
301         this(input);
302         setDelimiterString(delim);
303     }
304 
305     /**
306      * Constructs a tokenizer splitting using the specified delimiter matcher.
307      *
308      * @param input
309  *            the string which is to be parsed; this array is cloned
310      * @param delim
311      *            the field delimiter matcher
312      */
313     public StringTokenizer(final char[] input, final StringMatcher delim) {
314         this(input);
315         setDelimiterMatcher(delim);
316     }
317 
318     /**
319      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
320      * quote matcher.
321      *
322      * @param input
323  *            the string which is to be parsed; this array is cloned
324      * @param delim
325      *            the field delimiter character
326      * @param quote
327      *            the field quoted string character
328      */
329     public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
330         this(input, delim);
331         setQuoteMatcher(quote);
332     }
333 
334     /**
335     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
336      *
337      * @param input
338      *            the string which is to be parsed
339      */
340     public StringTokenizer(final String input) {
341         this.chars = input != null ? input.toCharArray() : null;
342     }
343 
344     /**
345      * Constructs a tokenizer splitting on the specified delimiter character.
346      *
347      * @param input
348      *            the string which is to be parsed
349      * @param delim
350      *            the field delimiter character
351      */
352     public StringTokenizer(final String input, final char delim) {
353         this(input);
354         setDelimiterChar(delim);
355     }
356 
357     /**
358      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
359      * quote character.
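     * <p>
     * For example (a minimal sketch; the input and the delimiter and quote characters are illustrative), a quote
     * can be escaped inside a quoted section by doubling it:
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = new StringTokenizer("a;'b;''b';c", ';', '\''); // illustrative input
     * List<String> tokens = tokenizer.getTokenList(); // ["a", "b;'b", "c"]
     * }</pre>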
360      *
361      * @param input
362      *            the string which is to be parsed
363      * @param delim
364      *            the field delimiter character
365      * @param quote
366      *            the field quoted string character
367      */
368     public StringTokenizer(final String input, final char delim, final char quote) {
369         this(input, delim);
370         setQuoteChar(quote);
371     }
372 
373     /**
374      * Constructs a tokenizer splitting on the specified delimiter string.
375      *
376      * @param input
377      *            the string which is to be parsed
378      * @param delim
379      *            the field delimiter string
380      */
381     public StringTokenizer(final String input, final String delim) {
382         this(input);
383         setDelimiterString(delim);
384     }
385 
386     /**
387      * Constructs a tokenizer splitting using the specified delimiter matcher.
388      *
389      * @param input
390      *            the string which is to be parsed
391      * @param delim
392      *            the field delimiter matcher
393      */
394     public StringTokenizer(final String input, final StringMatcher delim) {
395         this(input);
396         setDelimiterMatcher(delim);
397     }
398 
399     /**
400      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
401      * quote matcher.
402      *
403      * @param input
404      *            the string which is to be parsed
405      * @param delim
406      *            the field delimiter matcher
407      * @param quote
408      *            the field quoted string matcher
409      */
410     public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
411         this(input, delim);
412         setQuoteMatcher(quote);
413     }
414 
415     /**
416      * Unsupported ListIterator operation.
417      *
418      * @param obj
419  *            this parameter is ignored.
420      * @throws UnsupportedOperationException
421      *             always
422      */
423     @Override
424     public void add(final String obj) {
425         throw new UnsupportedOperationException("add() is unsupported");
426     }
427 
428     /**
429      * Adds a token to a list, paying attention to the parameters we've set.
430      *
431      * @param list
432      *            the list to add to
433      * @param tok
434      *            the token to add
435      */
436     private void addToken(final List<String> list, String tok) {
437         if (tok == null || tok.isEmpty()) {
438             if (isIgnoreEmptyTokens()) {
439                 return;
440             }
441             if (isEmptyTokenAsNull()) {
442                 tok = null;
443             }
444         }
445         list.add(tok);
446     }
447 
448     /**
449     * Checks if tokenization has been done, and if not, performs it.
450      */
451     private void checkTokenized() {
452         if (tokens == null) {
453             final List<String> split;
454             if (chars == null) {
455                 // still call tokenize as subclass may do some work
456                 split = tokenize(null, 0, 0);
457             } else {
458                 split = tokenize(chars, 0, chars.length);
459             }
460             tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
461         }
462     }
463 
464     /**
465     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
466     * list. If a {@link CloneNotSupportedException} is caught, this method returns {@code null}.
467      *
468      * @return a new instance of this Tokenizer which has been reset.
469      */
470     @Override
471     public Object clone() {
472         try {
473             return cloneReset();
474         } catch (final CloneNotSupportedException ex) {
475             return null;
476         }
477     }
478 
479     /**
480      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
481      * list.
482      *
483      * @return a new instance of this Tokenizer which has been reset.
484      * @throws CloneNotSupportedException
485      *             if there is a problem cloning
486      */
487     Object cloneReset() throws CloneNotSupportedException {
488         // this method exists to enable 100% test coverage
489         final StringTokenizer cloned = (StringTokenizer) super.clone();
490         if (cloned.chars != null) {
491             cloned.chars = cloned.chars.clone();
492         }
493         cloned.reset();
494         return cloned;
495     }
496 
497     /**
498      * Gets the String content that the tokenizer is parsing.
499      *
500      * @return The string content being parsed
501      */
502     public String getContent() {
503         if (chars == null) {
504             return null;
505         }
506         return new String(chars);
507     }
508 
509     /**
510      * Gets the field delimiter matcher.
511      *
512      * @return The delimiter matcher in use
513      */
514     public StringMatcher getDelimiterMatcher() {
515         return this.delimMatcher;
516     }
517 
518     /**
519      * Gets the ignored character matcher.
520      * <p>
521      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
522      * is not to ignore anything.
523      * </p>
524      *
525      * @return The ignored matcher in use
526      */
527     public StringMatcher getIgnoredMatcher() {
528         return ignoredMatcher;
529     }
530 
531     /**
532      * Gets the quote matcher currently in use.
533      * <p>
534     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
535     * default is not to use quoting (the none matcher).
536      * </p>
537      *
538      * @return The quote matcher in use
539      */
540     public StringMatcher getQuoteMatcher() {
541         return quoteMatcher;
542     }
543 
544     /**
545      * Gets a copy of the full token list as an independent modifiable array.
546      *
547      * @return The tokens as a String array
548      */
549     public String[] getTokenArray() {
550         checkTokenized();
551         return tokens.clone();
552     }
553 
554     /**
555      * Gets a copy of the full token list as an independent modifiable list.
556      *
557      * @return The tokens as a String list
558      */
559     public List<String> getTokenList() {
560         checkTokenized();
561         return new ArrayList<>(Arrays.asList(tokens));
562     }
563 
564     /**
565      * Gets the trimmer character matcher.
566      * <p>
567      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
568      * value is not to trim anything.
569      * </p>
570      *
571      * @return The trimmer matcher in use
572      */
573     public StringMatcher getTrimmerMatcher() {
574         return trimmerMatcher;
575     }
576 
577     /**
578      * Tests whether there are any more tokens.
579      *
580      * @return true if there are more tokens
581      */
582     @Override
583     public boolean hasNext() {
584         checkTokenized();
585         return tokenPos < tokens.length;
586     }
587 
588     /**
589      * Tests whether there are any previous tokens that can be iterated to.
590      *
591      * @return true if there are previous tokens
592      */
593     @Override
594     public boolean hasPrevious() {
595         checkTokenized();
596         return tokenPos > 0;
597     }
598 
599     /**
600      * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
601      *
602      * @return true if empty tokens are returned as null
603      */
604     public boolean isEmptyTokenAsNull() {
605         return this.emptyAsNull;
606     }
607 
608     /**
609      * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
610      *
611      * @return true if empty tokens are not returned
612      */
613     public boolean isIgnoreEmptyTokens() {
614         return ignoreEmptyTokens;
615     }
616 
617     /**
618      * Tests if the characters at the index specified match the quote already matched in readNextToken().
619      *
620      * @param srcChars
621      *            the character array being tokenized
622      * @param pos
623      *            the position to check for a quote
624      * @param len
625      *            the length of the character array being tokenized
626      * @param quoteStart
627      *            the start position of the matched quote, 0 if no quoting
628      * @param quoteLen
629      *            the length of the matched quote, 0 if no quoting
630      * @return true if a quote is matched
631      */
632     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
633             final int quoteLen) {
634         for (int i = 0; i < quoteLen; i++) {
635             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
636                 return false;
637             }
638         }
639         return true;
640     }
641 
642     /**
643      * Gets the next token.
644      *
645      * @return The next String token
646      * @throws NoSuchElementException
647      *             if there are no more elements
648      */
649     @Override
650     public String next() {
651         if (hasNext()) {
652             return tokens[tokenPos++];
653         }
654         throw new NoSuchElementException();
655     }
656 
657     /**
658      * Gets the index of the next token to return.
659      *
660      * @return The next token index
661      */
662     @Override
663     public int nextIndex() {
664         return tokenPos;
665     }
666 
667     /**
668      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
669      * {@link NoSuchElementException} when no tokens remain.
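     * <p>
     * This permits a simple loop idiom (a sketch; {@code tokenizer} stands for any instance of this class, and the
     * idiom assumes empty tokens are not being returned as null, otherwise a null token would end the loop early):
     * </p>
     * <pre>{@code
     * String token;
     * while ((token = tokenizer.nextToken()) != null) {
     *     // process the token
     * }
     * }</pre>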
670      *
671      * @return The next sequential token, or null when no more tokens are found
672      */
673     public String nextToken() {
674         if (hasNext()) {
675             return tokens[tokenPos++];
676         }
677         return null;
678     }
679 
680     /**
681      * Gets the token previous to the last returned token.
682      *
683      * @return The previous token
684      */
685     @Override
686     public String previous() {
687         if (hasPrevious()) {
688             return tokens[--tokenPos];
689         }
690         throw new NoSuchElementException();
691     }
692 
693     /**
694      * Gets the index of the previous token.
695      *
696      * @return The previous token index
697      */
698     @Override
699     public int previousIndex() {
700         return tokenPos - 1;
701     }
702 
703     /**
704      * Gets the previous token from the String.
705      *
706      * @return The previous sequential token, or null when no more tokens are found
707      */
708     public String previousToken() {
709         if (hasPrevious()) {
710             return tokens[--tokenPos];
711         }
712         return null;
713     }
714 
715     /**
716      * Reads character by character through the String to get the next token.
717      *
718      * @param srcChars
719      *            the character array being tokenized
720      * @param start
721  *            the first character of the field
722      * @param len
723      *            the length of the character array being tokenized
724      * @param workArea
725      *            a temporary work area
726      * @param tokenList
727      *            the list of parsed tokens
728      * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
729      *         string found
730      */
731     private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
732             final List<String> tokenList) {
733         // skip all leading whitespace, unless it is the
734         // field delimiter or the quote character
735         while (start < len) {
736             final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
737                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
738             if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
739                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
740                 break;
741             }
742             start += removeLen;
743         }
744 
745         // handle reaching end
746         if (start >= len) {
747             addToken(tokenList, StringUtils.EMPTY);
748             return -1;
749         }
750 
751         // handle empty token
752         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
753         if (delimLen > 0) {
754             addToken(tokenList, StringUtils.EMPTY);
755             return start + delimLen;
756         }
757 
758         // handle found token
759         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
760         if (quoteLen > 0) {
761             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
762         }
763         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
764     }
765 
766     /**
767      * Reads a possibly quoted string token.
768      *
769      * @param srcChars
770      *            the character array being tokenized
771      * @param start
772  *            the first character of the field
773      * @param len
774      *            the length of the character array being tokenized
775      * @param workArea
776      *            a temporary work area
777      * @param tokenList
778      *            the list of parsed tokens
779      * @param quoteStart
780      *            the start position of the matched quote, 0 if no quoting
781      * @param quoteLen
782      *            the length of the matched quote, 0 if no quoting
783     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if the
784     *         end of the string is reached
785      */
786     private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
787             final List<String> tokenList, final int quoteStart, final int quoteLen) {
788         // Loop until we've found the end of the quoted
789         // string or the end of the input
790         workArea.clear();
791         int pos = start;
792         boolean quoting = quoteLen > 0;
793         int trimStart = 0;
794 
795         while (pos < len) {
796             // quoting mode can occur several times throughout a string
797             // we must switch between quoting and non-quoting until we
798             // encounter a non-quoted delimiter, or end of string
799             if (quoting) {
800                 // In quoting mode
801 
802                 // If we've found a quote character, see if it's
803                 // followed by a second quote. If so, then we need
804                 // to actually put the quote character into the token
805                 // rather than end the token.
806                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
807                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
808                         // matched pair of quotes, thus an escaped quote
809                         workArea.append(srcChars, pos, quoteLen);
810                         pos += quoteLen * 2;
811                         trimStart = workArea.size();
812                         continue;
813                     }
814 
815                     // end of quoting
816                     quoting = false;
817                     pos += quoteLen;
818                     continue;
819                 }
820 
821             } else {
822                 // Not in quoting mode
823 
824                 // check for delimiter, and thus end of token
825                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
826                 if (delimLen > 0) {
827                     // return condition when end of token found
828                     addToken(tokenList, workArea.substring(0, trimStart));
829                     return pos + delimLen;
830                 }
831 
832                 // check for quote, and thus back into quoting mode
833                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
834                     quoting = true;
835                     pos += quoteLen;
836                     continue;
837                 }
838 
839                 // check for ignored (outside quotes), and ignore
840                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
841                 if (ignoredLen > 0) {
842                     pos += ignoredLen;
843                     continue;
844                 }
845 
846                 // check for trimmed character
847                 // don't yet know if it's at the end, so copy to workArea
848                 // use trimStart to keep track of trim at the end
849                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
850                 if (trimmedLen > 0) {
851                     workArea.append(srcChars, pos, trimmedLen);
852                     pos += trimmedLen;
853                     continue;
854                 }
855             }
856             // copy regular character from inside quotes
857             workArea.append(srcChars[pos++]);
858             trimStart = workArea.size();
859         }
860 
861         // return condition when end of string found
862         addToken(tokenList, workArea.substring(0, trimStart));
863         return -1;
864     }
865 
866     /**
867      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
868      *
869      * @throws UnsupportedOperationException
870      *             always
871      */
872     @Override
873     public void remove() {
874         throw new UnsupportedOperationException("remove() is unsupported");
875     }
876 
877     /**
878      * Resets this tokenizer, forgetting all parsing and iteration already completed.
879      * <p>
880      * This method allows the same tokenizer to be reused for the same String.
881      * </p>
882      *
883      * @return this, to enable chaining
884      */
885     public StringTokenizer reset() {
886         tokenPos = 0;
887         tokens = null;
888         return this;
889     }
890 
891     /**
892      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
893      * same settings on multiple input lines.
894      *
895      * @param input
896  *            the new character array to tokenize (cloned internally), null sets no text to parse
897      * @return this, to enable chaining
898      */
899     public StringTokenizer reset(final char[] input) {
900         reset();
901         this.chars = input != null ? input.clone() : null;
902         return this;
903     }
904 
905     /**
906      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
907      * same settings on multiple input lines.
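     * <p>
     * For example (a sketch; {@code lines} and {@code process} are placeholders for caller code):
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = StringTokenizer.getCSVInstance();
     * for (String line : lines) { // "lines" is any Iterable<String> supplied by the caller
     *     tokenizer.reset(line);
     *     process(tokenizer.getTokenList()); // "process" is a placeholder for caller code
     * }
     * }</pre>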
908      *
909      * @param input
910      *            the new string to tokenize, null sets no text to parse
911      * @return this, to enable chaining
912      */
913     public StringTokenizer reset(final String input) {
914         reset();
915         this.chars = input != null ? input.toCharArray() : null;
916         return this;
917     }
918 
919     /**
920      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
921      *
922      * @param obj
923  *            this parameter is ignored.
924      * @throws UnsupportedOperationException
925      *             always
926      */
927     @Override
928     public void set(final String obj) {
929         throw new UnsupportedOperationException("set() is unsupported");
930     }
931 
932     /**
933      * Sets the field delimiter character.
934      *
935      * @param delim
936      *            the delimiter character to use
937      * @return this, to enable chaining
938      */
939     public StringTokenizer setDelimiterChar(final char delim) {
940         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
941     }
942 
943     /**
944      * Sets the field delimiter matcher.
945      * <p>
946      * The delimiter is used to separate one token from another.
947      * </p>
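     * <p>
     * For example (a sketch; the input string is illustrative), a multi-character delimiter can be supplied via a
     * string matcher:
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = new StringTokenizer("a::b::c") // illustrative input
     *         .setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher("::"));
     * List<String> tokens = tokenizer.getTokenList(); // ["a", "b", "c"]
     * }</pre>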
948      *
949      * @param delim
950      *            the delimiter matcher to use
951      * @return this, to enable chaining
952      */
953     public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
954         this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
955         return this;
956     }
957 
958     /**
959      * Sets the field delimiter string.
960      *
961      * @param delim
962      *            the delimiter string to use
963      * @return this, to enable chaining
964      */
965     public StringTokenizer setDelimiterString(final String delim) {
966         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
967     }
968 
969     /**
970      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
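     * <p>
     * For example (a sketch; the input is illustrative), combined with {@link #setIgnoreEmptyTokens(boolean)}:
     * </p>
     * <pre>{@code
     * StringTokenizer tokenizer = new StringTokenizer("a,,c", ',') // illustrative input
     *         .setIgnoreEmptyTokens(false)
     *         .setEmptyTokenAsNull(true);
     * List<String> tokens = tokenizer.getTokenList(); // ["a", null, "c"]
     * }</pre>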
971      *
972      * @param emptyAsNull
973      *            whether empty tokens are returned as null
974      * @return this, to enable chaining
975      */
976     public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
977         this.emptyAsNull = emptyAsNull;
978         return this;
979     }
980 
981     /**
982      * Sets the character to ignore.
983      * <p>
984      * This character is ignored when parsing the String, unless it is within a quoted region.
985      * </p>
986      *
987      * @param ignored
988      *            the ignored character to use
989      * @return this, to enable chaining
990      */
991     public StringTokenizer setIgnoredChar(final char ignored) {
992         return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
993     }
994 
995     /**
996      * Sets the matcher for characters to ignore.
997      * <p>
998      * These characters are ignored when parsing the String, unless they are within a quoted region.
999      * </p>
1000      *
1001      * @param ignored
1002      *            the ignored matcher to use, null ignored
1003      * @return this, to enable chaining
1004      */
1005     public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1006         if (ignored != null) {
1007             this.ignoredMatcher = ignored;
1008         }
1009         return this;
1010     }
1011 
1012     /**
1013      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1014      *
1015      * @param ignoreEmptyTokens
1016      *            whether empty tokens are not returned
1017      * @return this, to enable chaining
1018      */
1019     public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1020         this.ignoreEmptyTokens = ignoreEmptyTokens;
1021         return this;
1022     }
1023 
1024     /**
1025      * Sets the quote character to use.
1026      * <p>
1027      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1028      * </p>
1029      *
1030      * @param quote
1031      *            the quote character to use
1032      * @return this, to enable chaining
1033      */
1034     public StringTokenizer setQuoteChar(final char quote) {
1035         return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
1036     }
1037 
1038     /**
1039      * Sets the quote matcher to use.
1040      * <p>
1041      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1042      * </p>
1043      *
1044      * @param quote
1045      *            the quote matcher to use, null ignored
1046      * @return this, to enable chaining
1047      */
1048     public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
1049         if (quote != null) {
1050             this.quoteMatcher = quote;
1051         }
1052         return this;
1053     }
1054 
1055     /**
1056      * Sets the matcher for characters to trim.
1057      * <p>
1058      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1059      *
1060      * @param trimmer
1061      *            the trimmer matcher to use, null ignored
1062      * @return this, to enable chaining
1063      */
1064     public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1065         if (trimmer != null) {
1066             this.trimmerMatcher = trimmer;
1067         }
1068         return this;
1069     }
1070 
1071     /**
1072      * Gets the number of tokens found in the String.
1073      *
1074      * @return The number of matched tokens
1075      */
1076     public int size() {
1077         checkTokenized();
1078         return tokens.length;
1079     }
1080 
1081     /**
1082  * Internal method that performs the tokenization.
1083      * <p>
1084      * Most users of this class do not need to call this method. This method will be called automatically by other
1085      * (public) methods when required.
1086      * </p>
1087      * <p>
1088      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
1089      * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
1090  * strings. It is also possible to filter the results.
1091      * </p>
1092      * <p>
1093  * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
1094  * method; however, a subclass may pass other values, or even an entirely different array.
1095      * </p>
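     * <p>
     * As a rough sketch of such a subclass (illustrative only; it drops empty or null tokens after the normal
     * tokenization pass):
     * </p>
     * <pre>{@code
     * public class NonEmptyTokenizer extends StringTokenizer {
     *     // overrides StringTokenizer.tokenize to post-process the token list
     *     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
     *         final List<String> tokens = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *         tokens.removeIf(t -> t == null || t.isEmpty());
     *         return tokens;
     *     }
     * }
     * }</pre>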
1096      *
1097      * @param srcChars
1098      *            the character array being tokenized, may be null
1099      * @param offset
1100      *            the start position within the character array, must be valid
1101      * @param count
1102      *            the number of characters to tokenize, must be valid
1103      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1104      */
1105     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1106         if (srcChars == null || count == 0) {
1107             return Collections.emptyList();
1108         }
1109         final TextStringBuilder buf = new TextStringBuilder();
1110         final List<String> tokenList = new ArrayList<>();
1111         int pos = offset;
1112 
1113         // loop around the entire buffer
1114         while (pos >= 0 && pos < count) {
1115             // find next token
1116             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1117 
1118             // handle case where end of string is a delimiter
1119             if (pos >= count) {
1120                 addToken(tokenList, StringUtils.EMPTY);
1121             }
1122         }
1123         return tokenList;
1124     }
1125 
1126     /**
1127      * Gets the String content that the tokenizer is parsing.
1128      *
1129      * @return The string content being parsed
1130      */
1131     @Override
1132     public String toString() {
1133         if (tokens == null) {
1134             return "StringTokenizer[not tokenized yet]";
1135         }
1136         return "StringTokenizer" + getTokenList();
1137     }
1138 
1139 }