001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025
026import org.apache.commons.lang3.ArrayUtils;
027import org.apache.commons.lang3.StringUtils;
028import org.apache.commons.text.matcher.StringMatcher;
029import org.apache.commons.text.matcher.StringMatcherFactory;
030
031/**
032 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims to do a similar job to
035 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
036 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
037 * <p>
038 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
039 * <em>delimiter</em>. One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
042 * escaped within a quoted section by duplicating itself.
043 * <p>
044 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
045 * specifies these characters. One usage might be to trim whitespace characters.
046 * <p>
047 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
048 * these characters to be removed. One usage might be to remove new line characters.
049 * <p>
050 * Empty tokens may be removed or returned as null.
051 *
052 * <pre>
053 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
054 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
055 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
056 * </pre>
057 *
058 * <table>
059 * <caption>StringTokenizer properties and options</caption>
060 * <tr>
061 * <th>Property</th>
062 * <th>Type</th>
063 * <th>Default</th>
064 * </tr>
065 * <tr>
066 * <td>delim</td>
067 * <td>CharSetMatcher</td>
068 * <td>{ \t\n\r\f}</td>
069 * </tr>
070 * <tr>
071 * <td>quote</td>
072 * <td>NoneMatcher</td>
073 * <td>{}</td>
074 * </tr>
075 * <tr>
076 * <td>ignore</td>
077 * <td>NoneMatcher</td>
078 * <td>{}</td>
079 * </tr>
080 * <tr>
081 * <td>emptyTokenAsNull</td>
082 * <td>boolean</td>
083 * <td>false</td>
084 * </tr>
085 * <tr>
086 * <td>ignoreEmptyTokens</td>
087 * <td>boolean</td>
088 * <td>true</td>
089 * </tr>
090 * </table>
091 *
092 * @since 1.3
093 */
094public class StringTokenizer implements ListIterator<String>, Cloneable {
095
    /**
     * Comma separated values tokenizer prototype.
     * <p>
     * Configured with a comma delimiter matcher, a double-quote quote matcher, a none matcher for ignored characters
     * and the trim matcher for trimming. Empty tokens are neither dropped nor converted to {@code null}. The prototype
     * is private and is only cloned (see {@code getCSVClone()}), so each caller gets its own instance.
     * </p>
     */
    // @formatter:off
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
106
    /**
     * Tab separated values tokenizer prototype.
     * <p>
     * Configured with a tab delimiter matcher, a double-quote quote matcher, a none matcher for ignored characters
     * and the trim matcher for trimming. Empty tokens are neither dropped nor converted to {@code null}. The prototype
     * is private and is only cloned (see {@code getTSVClone()}), so each caller gets its own instance.
     * </p>
     */
    // @formatter:off
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
117
    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     * <p>
     * The clone is produced by {@link #clone()}, which returns a reset copy, so the shared prototype itself is never
     * handed to callers.
     * </p>
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }
126
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, with no input set yet.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }
140
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse; passed to {@link #reset(char[])}, which stores a copy of the array
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        return getCSVClone().reset(input);
    }
153
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse; passed to {@link #reset(String)}, {@code null} sets no text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        return getCSVClone().reset(input);
    }
166
    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     * <p>
     * The clone is produced by {@link #clone()}, which returns a reset copy, so the shared prototype itself is never
     * handed to callers.
     * </p>
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }
175
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings, with no input set yet. The default for
     * TSV processing will be trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }
188
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings, initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse; passed to {@link #reset(char[])}, which stores a copy of the array
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        return getTSVClone().reset(input);
    }
200
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings, initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the string to parse; passed to {@link #reset(String)}, {@code null} sets no text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        return getTSVClone().reset(input);
    }
212
    /** The text to work on; {@code null} when no text has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and again after a reset. */
    private String[] tokens;

    /** The current iteration position, an index into {@link #tokens}. */
    private int tokenPos;

    /** The delimiter matcher; defaults to the factory's split matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher; defaults to the factory's none matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher; defaults to the factory's none matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher; defaults to the factory's none matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
239
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
     * tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StringTokenizer() {
        // no input yet; a reset(...) overload supplies the text later
        this.chars = null;
    }
250
251    /**
252     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
253     *
254     * @param input
255     *            the string which is to be parsed, not cloned
256     */
257    public StringTokenizer(final char[] input) {
258        this.chars = input != null ? input.clone() : null;
259    }
260
    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input
     *            the characters to parse; the delegated constructor stores a copy, {@code null} means no text
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
273
    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the characters to parse; the delegated constructor stores a copy, {@code null} means no text
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
289
    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the characters to parse; the delegated constructor stores a copy, {@code null} means no text
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
302
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the characters to parse; the delegated constructor stores a copy, {@code null} means no text
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
315
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the characters to parse; the delegated constructor stores a copy, {@code null} means no text
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
331
332    /**
333     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
334     *
335     * @param input
336     *            the string which is to be parsed
337     */
338    public StringTokenizer(final String input) {
339        this.chars = input != null ? input.toCharArray() : null;
340    }
341
    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
354
    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
370
    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
383
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
396
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
412
    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
425
426    /**
427     * Adds a token to a list, paying attention to the parameters we've set.
428     *
429     * @param list
430     *            the list to add to
431     * @param tok
432     *            the token to add
433     */
434    private void addToken(final List<String> list, String tok) {
435        if (tok == null || tok.isEmpty()) {
436            if (isIgnoreEmptyTokens()) {
437                return;
438            }
439            if (isEmptyTokenAsNull()) {
440                tok = null;
441            }
442        }
443        list.add(tok);
444    }
445
446    /**
447     * Checks if tokenization has been done, and if not then do it.
448     */
449    private void checkTokenized() {
450        if (tokens == null) {
451            final List<String> split;
452            if (chars == null) {
453                // still call tokenize as subclass may do some work
454                split = tokenize(null, 0, 0);
455            } else {
456                split = tokenize(chars, 0, chars.length);
457            }
458            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
459        }
460    }
461
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset, or {@code null} if cloning failed.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            // documented contract: report a failed clone as null instead of propagating the exception
            return null;
        }
    }
476
477    /**
478     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
479     * list.
480     *
481     * @return a new instance of this Tokenizer which has been reset.
482     * @throws CloneNotSupportedException
483     *             if there is a problem cloning
484     */
485    Object cloneReset() throws CloneNotSupportedException {
486        // this method exists to enable 100% test coverage
487        final StringTokenizer cloned = (StringTokenizer) super.clone();
488        if (cloned.chars != null) {
489            cloned.chars = cloned.chars.clone();
490        }
491        cloned.reset();
492        return cloned;
493    }
494
495    /**
496     * Gets the String content that the tokenizer is parsing.
497     *
498     * @return The string content being parsed
499     */
500    public String getContent() {
501        if (chars == null) {
502            return null;
503        }
504        return new String(chars);
505    }
506
    /**
     * Gets the field delimiter matcher.
     * <p>
     * Unless changed, this is the matcher the field is initialized with,
     * {@code StringMatcherFactory.INSTANCE.splitMatcher()}.
     * </p>
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }
515
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }
528
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
     * default value is a matcher that matches nothing, so quoting is off unless a quote matcher is set (the CSV and
     * TSV prototypes set a double-quote matcher).
     * </p>
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }
541
    /**
     * Gets a copy of the full token list as an independent modifiable array.
     * <p>
     * Tokenizes the input first if that has not happened yet.
     * </p>
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        // clone so callers cannot modify the cached tokens
        return tokens.clone();
    }
551
    /**
     * Gets a copy of the full token list as an independent modifiable list.
     * <p>
     * Tokenizes the input first if that has not happened yet.
     * </p>
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        // copy into a fresh ArrayList: Arrays.asList alone would be a fixed-size view over the cached array
        return new ArrayList<>(Arrays.asList(tokens));
    }
561
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }
574
    /**
     * Tests whether there are any more tokens.
     * <p>
     * Tokenizes the input first if that has not happened yet.
     * </p>
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }
585
    /**
     * Tests whether there are any previous tokens that can be iterated to.
     * <p>
     * Tokenizes the input first if that has not happened yet.
     * </p>
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }
596
    /**
     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     * <p>
     * The setting only has an effect when empty tokens are not ignored, because ignored empty tokens are dropped
     * before this setting is consulted.
     * </p>
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }
605
    /**
     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
     * <p>
     * When true, {@code null} and zero-length tokens are left out of the token list.
     * </p>
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }
614
615    /**
616     * Tests if the characters at the index specified match the quote already matched in readNextToken().
617     *
618     * @param srcChars
619     *            the character array being tokenized
620     * @param pos
621     *            the position to check for a quote
622     * @param len
623     *            the length of the character array being tokenized
624     * @param quoteStart
625     *            the start position of the matched quote, 0 if no quoting
626     * @param quoteLen
627     *            the length of the matched quote, 0 if no quoting
628     * @return true if a quote is matched
629     */
630    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
631            final int quoteLen) {
632        for (int i = 0; i < quoteLen; i++) {
633            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
634                return false;
635            }
636        }
637        return true;
638    }
639
640    /**
641     * Gets the next token.
642     *
643     * @return The next String token
644     * @throws NoSuchElementException
645     *             if there are no more elements
646     */
647    @Override
648    public String next() {
649        if (hasNext()) {
650            return tokens[tokenPos++];
651        }
652        throw new NoSuchElementException();
653    }
654
    /**
     * Gets the index of the next token to return.
     * <p>
     * This is the current iteration position; calling it does not trigger tokenization.
     * </p>
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }
664
665    /**
666     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
667     * {@link NoSuchElementException} when no tokens remain.
668     *
669     * @return The next sequential token, or null when no more tokens are found
670     */
671    public String nextToken() {
672        if (hasNext()) {
673            return tokens[tokenPos++];
674        }
675        return null;
676    }
677
678    /**
679     * Gets the token previous to the last returned token.
680     *
681     * @return The previous token
682     */
683    @Override
684    public String previous() {
685        if (hasPrevious()) {
686            return tokens[--tokenPos];
687        }
688        throw new NoSuchElementException();
689    }
690
    /**
     * Gets the index of the previous token.
     * <p>
     * This is the current iteration position minus one, so it is -1 at the start; calling it does not trigger
     * tokenization.
     * </p>
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }
700
701    /**
702     * Gets the previous token from the String.
703     *
704     * @return The previous sequential token, or null when no more tokens are found
705     */
706    public String previousToken() {
707        if (hasPrevious()) {
708            return tokens[--tokenPos];
709        }
710        return null;
711    }
712
713    /**
714     * Reads character by character through the String to get the next token.
715     *
716     * @param srcChars
717     *            the character array being tokenized
718     * @param start
719     *            the first character of field
720     * @param len
721     *            the length of the character array being tokenized
722     * @param workArea
723     *            a temporary work area
724     * @param tokenList
725     *            the list of parsed tokens
726     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
727     *         string found
728     */
729    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
730            final List<String> tokenList) {
731        // skip all leading whitespace, unless it is the
732        // field delimiter or the quote character
733        while (start < len) {
734            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
735                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
736            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
737                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
738                break;
739            }
740            start += removeLen;
741        }
742
743        // handle reaching end
744        if (start >= len) {
745            addToken(tokenList, StringUtils.EMPTY);
746            return -1;
747        }
748
749        // handle empty token
750        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
751        if (delimLen > 0) {
752            addToken(tokenList, StringUtils.EMPTY);
753            return start + delimLen;
754        }
755
756        // handle found token
757        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
758        if (quoteLen > 0) {
759            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
760        }
761        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
762    }
763
    /**
     * Reads a possibly quoted string token.
     * <p>
     * The token is assembled in {@code workArea}, which is cleared first. Outside quotes, a delimiter ends the token,
     * a quote switches to quoting mode, ignored characters are skipped, and trimmed characters are copied but only
     * kept if a regular character follows them. Inside quotes, two consecutive quotes add a single quote to the
     * token, and a single quote leaves quoting mode.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea up to and including the last character that must be kept;
        // anything appended after it is trailing trim and is cut off when the token is added
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character: any character inside quotes that is not a quote,
            // or a character outside quotes that no matcher claimed
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
863
    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     * <p>
     * The token list cannot be modified through the iterator.
     * </p>
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
874
    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String. The iteration position goes back to 0
     * and the cached tokens are discarded, so the text is tokenized again on the next access.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }
888
889    /**
890     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
891     * same settings on multiple input lines.
892     *
893     * @param input
894     *            the new character array to tokenize, not cloned, null sets no text to parse
895     * @return this, to enable chaining
896     */
897    public StringTokenizer reset(final char[] input) {
898        reset();
899        this.chars = input != null ? input.clone() : null;
900        return this;
901    }
902
903    /**
904     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
905     * same settings on multiple input lines.
906     *
907     * @param input
908     *            the new string to tokenize, null sets no text to parse
909     * @return this, to enable chaining
910     */
911    public StringTokenizer reset(final String input) {
912        reset();
913        this.chars = input != null ? input.toCharArray() : null;
914        return this;
915    }
916
917    /**
918     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
919     *
920     * @param obj
921     *            this parameter ignored.
922     * @throws UnsupportedOperationException
923     *             always
924     */
925    @Override
926    public void set(final String obj) {
927        throw new UnsupportedOperationException("set() is unsupported");
928    }
929
930    /**
931     * Sets the field delimiter character.
932     *
933     * @param delim
934     *            the delimiter character to use
935     * @return this, to enable chaining
936     */
937    public StringTokenizer setDelimiterChar(final char delim) {
938        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
939    }
940
941    /**
942     * Sets the field delimiter matcher.
943     * <p>
944     * The delimiter is used to separate one token from another.
945     * </p>
946     *
947     * @param delim
948     *            the delimiter matcher to use
949     * @return this, to enable chaining
950     */
951    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
952        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
953        return this;
954    }
955
956    /**
957     * Sets the field delimiter string.
958     *
959     * @param delim
960     *            the delimiter string to use
961     * @return this, to enable chaining
962     */
963    public StringTokenizer setDelimiterString(final String delim) {
964        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
965    }
966
967    /**
968     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
969     *
970     * @param emptyAsNull
971     *            whether empty tokens are returned as null
972     * @return this, to enable chaining
973     */
974    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
975        this.emptyAsNull = emptyAsNull;
976        return this;
977    }
978
979    /**
980     * Sets the character to ignore.
981     * <p>
982     * This character is ignored when parsing the String, unless it is within a quoted region.
983     * </p>
984     *
985     * @param ignored
986     *            the ignored character to use
987     * @return this, to enable chaining
988     */
989    public StringTokenizer setIgnoredChar(final char ignored) {
990        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
991    }
992
993    /**
994     * Sets the matcher for characters to ignore.
995     * <p>
996     * These characters are ignored when parsing the String, unless they are within a quoted region.
997     * </p>
998     *
999     * @param ignored
1000     *            the ignored matcher to use, null ignored
1001     * @return this, to enable chaining
1002     */
1003    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1004        if (ignored != null) {
1005            this.ignoredMatcher = ignored;
1006        }
1007        return this;
1008    }
1009
1010    /**
1011     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1012     *
1013     * @param ignoreEmptyTokens
1014     *            whether empty tokens are not returned
1015     * @return this, to enable chaining
1016     */
1017    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1018        this.ignoreEmptyTokens = ignoreEmptyTokens;
1019        return this;
1020    }
1021
1022    /**
1023     * Sets the quote character to use.
1024     * <p>
1025     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1026     * </p>
1027     *
1028     * @param quote
1029     *            the quote character to use
1030     * @return this, to enable chaining
1031     */
1032    public StringTokenizer setQuoteChar(final char quote) {
1033        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
1034    }
1035
1036    /**
1037     * Sets the quote matcher to use.
1038     * <p>
1039     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1040     * </p>
1041     *
1042     * @param quote
1043     *            the quote matcher to use, null ignored
1044     * @return this, to enable chaining
1045     */
1046    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
1047        if (quote != null) {
1048            this.quoteMatcher = quote;
1049        }
1050        return this;
1051    }
1052
1053    /**
1054     * Sets the matcher for characters to trim.
1055     * <p>
1056     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1057     *
1058     * @param trimmer
1059     *            the trimmer matcher to use, null ignored
1060     * @return this, to enable chaining
1061     */
1062    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1063        if (trimmer != null) {
1064            this.trimmerMatcher = trimmer;
1065        }
1066        return this;
1067    }
1068
1069    /**
1070     * Gets the number of tokens found in the String.
1071     *
1072     * @return The number of matched tokens
1073     */
1074    public int size() {
1075        checkTokenized();
1076        return tokens.length;
1077    }
1078
1079    /**
1080     * Internal method to performs the tokenization.
1081     * <p>
1082     * Most users of this class do not need to call this method. This method will be called automatically by other
1083     * (public) methods when required.
1084     * </p>
1085     * <p>
1086     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
1087     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
1088     * strings. It is also be possible to filter the results.
1089     * </p>
1090     * <p>
1091     * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this
1092     * method, however a subclass may pass other values, or even an entirely different array.
1093     * </p>
1094     *
1095     * @param srcChars
1096     *            the character array being tokenized, may be null
1097     * @param offset
1098     *            the start position within the character array, must be valid
1099     * @param count
1100     *            the number of characters to tokenize, must be valid
1101     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1102     */
1103    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1104        if (srcChars == null || count == 0) {
1105            return Collections.emptyList();
1106        }
1107        final TextStringBuilder buf = new TextStringBuilder();
1108        final List<String> tokenList = new ArrayList<>();
1109        int pos = offset;
1110
1111        // loop around the entire buffer
1112        while (pos >= 0 && pos < count) {
1113            // find next token
1114            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1115
1116            // handle case where end of string is a delimiter
1117            if (pos >= count) {
1118                addToken(tokenList, StringUtils.EMPTY);
1119            }
1120        }
1121        return tokenList;
1122    }
1123
1124    /**
1125     * Gets the String content that the tokenizer is parsing.
1126     *
1127     * @return The string content being parsed
1128     */
1129    @Override
1130    public String toString() {
1131        if (tokens == null) {
1132            return "StringTokenizer[not tokenized yet]";
1133        }
1134        return "StringTokenizer" + getTokenList();
1135    }
1136
1137}