001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025
026import org.apache.commons.lang3.ArrayUtils;
027import org.apache.commons.lang3.StringUtils;
028import org.apache.commons.text.matcher.StringMatcher;
029import org.apache.commons.text.matcher.StringMatcherFactory;
030
031/**
032 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims to do a similar job to
035 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
036 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
037 * <p>
038 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
039 * <em>delimiter</em>. One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
042 * escaped within a quoted section by duplicating itself.
043 * <p>
044 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
045 * specifies these characters. One usage might be to trim whitespace characters.
046 * <p>
047 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
048 * these characters to be removed. One usage might be to remove new line characters.
049 * <p>
050 * Empty tokens may be removed or returned as null.
051 *
052 * <pre>
053 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
054 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
055 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
056 * </pre>
057 *
058 * <table>
059 * <caption>StringTokenizer properties and options</caption>
060 * <tr>
061 * <th>Property</th>
062 * <th>Type</th>
063 * <th>Default</th>
064 * </tr>
065 * <tr>
066 * <td>delim</td>
067 * <td>CharSetMatcher</td>
068 * <td>{ \t\n\r\f}</td>
069 * </tr>
070 * <tr>
071 * <td>quote</td>
072 * <td>NoneMatcher</td>
073 * <td>{}</td>
074 * </tr>
075 * <tr>
076 * <td>ignore</td>
077 * <td>NoneMatcher</td>
078 * <td>{}</td>
079 * </tr>
080 * <tr>
081 * <td>emptyTokenAsNull</td>
082 * <td>boolean</td>
083 * <td>false</td>
084 * </tr>
085 * <tr>
086 * <td>ignoreEmptyTokens</td>
087 * <td>boolean</td>
088 * <td>true</td>
089 * </tr>
090 * </table>
091 *
092 * @since 1.3
093 */
094public class StringTokenizer implements ListIterator<String>, Cloneable {
095
096    /** Comma separated values tokenizer internal variable. */
097    // @formatter:off
098    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
099            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
100            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
101            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
102            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
103            .setEmptyTokenAsNull(false)
104            .setIgnoreEmptyTokens(false);
105    // @formatter:on
106
107    /** Tab separated values tokenizer internal variable. */
108    // @formatter:off
109    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
110            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
111            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
112            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
113            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
114            .setEmptyTokenAsNull(false)
115            .setIgnoreEmptyTokens(false);
116    // @formatter:on
117
118    /**
119     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
120     *
121     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122     */
123    private static StringTokenizer getCSVClone() {
124        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
125    }
126
127    /**
128     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
129     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
130     * setTrimmer method).
131     * <p>
132     * You must call a "reset" method to set the string which you want to parse.
133     * </p>
134     *
135     * @return a new tokenizer instance which parses Comma Separated Value strings.
136     */
137    public static StringTokenizer getCSVInstance() {
138        return getCSVClone();
139    }
140
141    /**
142     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
143     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
144     *
145     * @param input the text to parse.
146     * @return a new tokenizer instance which parses Comma Separated Value strings.
147     */
148    public static StringTokenizer getCSVInstance(final char[] input) {
149        return getCSVClone().reset(input);
150    }
151
152    /**
153     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
154     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
155     *
156     * @param input the text to parse.
157     * @return a new tokenizer instance which parses Comma Separated Value strings.
158     */
159    public static StringTokenizer getCSVInstance(final String input) {
160        return getCSVClone().reset(input);
161    }
162
163    /**
164     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
165     *
166     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167     */
168    private static StringTokenizer getTSVClone() {
169        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
170    }
171
172    /**
173     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
174     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
175     * <p>
176     * You must call a "reset" method to set the string which you want to parse.
177     * </p>
178     *
179     * @return a new tokenizer instance which parses Tab Separated Value strings.
180     */
181    public static StringTokenizer getTSVInstance() {
182        return getTSVClone();
183    }
184
185    /**
186     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
187     * be overridden with the setTrimmer method).
188     *
189     * @param input the string to parse.
190     * @return a new tokenizer instance which parses Tab Separated Value strings.
191     */
192    public static StringTokenizer getTSVInstance(final char[] input) {
193        return getTSVClone().reset(input);
194    }
195
196    /**
197     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
198     * be overridden with the setTrimmer method).
199     *
200     * @param input the string to parse.
201     * @return a new tokenizer instance which parses Tab Separated Value strings.
202     */
203    public static StringTokenizer getTSVInstance(final String input) {
204        return getTSVClone().reset(input);
205    }
206
207    /** The text to work on. */
208    private char[] chars;
209
210    /** The parsed tokens. */
211    private String[] tokens;
212
213    /** The current iteration position. */
214    private int tokenPos;
215
216    /** The delimiter matcher. */
217    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
218
219    /** The quote matcher. */
220    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
221
222    /** The ignored matcher. */
223    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
224
225    /** The trimmer matcher. */
226    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
227
228    /** Whether to return empty tokens as null. */
229    private boolean emptyAsNull;
230
231    /** Whether to ignore empty tokens. */
232    private boolean ignoreEmptyTokens = true;
233
234    /**
235     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
236     * <p>
237     * This constructor is normally used with {@link #reset(String)}.
238     * </p>
239     */
240    public StringTokenizer() {
241        this.chars = null;
242    }
243
244    /**
245     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
246     *
247     * @param input the string which is to be parsed, not cloned.
248     */
249    public StringTokenizer(final char[] input) {
250        this.chars = input != null ? input.clone() : null;
251    }
252
253    /**
254     * Constructs a tokenizer splitting on the specified character.
255     *
256     * @param input the string which is to be parsed, not cloned.
257     * @param delim the field delimiter character.
258     */
259    public StringTokenizer(final char[] input, final char delim) {
260        this(input);
261        setDelimiterChar(delim);
262    }
263
264    /**
265     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
266     *
267     * @param input the string which is to be parsed, not cloned.
268     * @param delim the field delimiter character.
269     * @param quote the field quoted string character.
270     */
271    public StringTokenizer(final char[] input, final char delim, final char quote) {
272        this(input, delim);
273        setQuoteChar(quote);
274    }
275
276    /**
277     * Constructs a tokenizer splitting on the specified string.
278     *
279     * @param input the string which is to be parsed, not cloned.
280     * @param delim the field delimiter string.
281     */
282    public StringTokenizer(final char[] input, final String delim) {
283        this(input);
284        setDelimiterString(delim);
285    }
286
287    /**
288     * Constructs a tokenizer splitting using the specified delimiter matcher.
289     *
290     * @param input the string which is to be parsed, not cloned.
291     * @param delim the field delimiter matcher.
292     */
293    public StringTokenizer(final char[] input, final StringMatcher delim) {
294        this(input);
295        setDelimiterMatcher(delim);
296    }
297
298    /**
299     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
300     *
301     * @param input the string which is to be parsed, not cloned.
302     * @param delim the field delimiter character.
303     * @param quote the field quoted string character.
304     */
305    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
306        this(input, delim);
307        setQuoteMatcher(quote);
308    }
309
310    /**
311     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
312     *
313     * @param input the string which is to be parsed.
314     */
315    public StringTokenizer(final String input) {
316        this.chars = input != null ? input.toCharArray() : null;
317    }
318
319    /**
320     * Constructs a tokenizer splitting on the specified delimiter character.
321     *
322     * @param input the string which is to be parsed.
323     * @param delim the field delimiter character.
324     */
325    public StringTokenizer(final String input, final char delim) {
326        this(input);
327        setDelimiterChar(delim);
328    }
329
330    /**
331     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified quote character.
332     *
333     * @param input the string which is to be parsed.
334     * @param delim the field delimiter character.
335     * @param quote the field quoted string character.
336     */
337    public StringTokenizer(final String input, final char delim, final char quote) {
338        this(input, delim);
339        setQuoteChar(quote);
340    }
341
342    /**
343     * Constructs a tokenizer splitting on the specified delimiter string.
344     *
345     * @param input the string which is to be parsed.
346     * @param delim the field delimiter string.
347     */
348    public StringTokenizer(final String input, final String delim) {
349        this(input);
350        setDelimiterString(delim);
351    }
352
353    /**
354     * Constructs a tokenizer splitting using the specified delimiter matcher.
355     *
356     * @param input the string which is to be parsed.
357     * @param delim the field delimiter matcher.
358     */
359    public StringTokenizer(final String input, final StringMatcher delim) {
360        this(input);
361        setDelimiterMatcher(delim);
362    }
363
364    /**
365     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified quote matcher.
366     *
367     * @param input the string which is to be parsed.
368     * @param delim the field delimiter matcher.
369     * @param quote the field quoted string matcher.
370     */
371    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
372        this(input, delim);
373        setQuoteMatcher(quote);
374    }
375
376    /**
377     * Unsupported ListIterator operation.
378     *
379     * @param obj this parameter ignored.
380     * @throws UnsupportedOperationException always.
381     */
382    @Override
383    public void add(final String obj) {
384        throw new UnsupportedOperationException("add() is unsupported");
385    }
386
387    /**
388     * Adds a token to a list, paying attention to the parameters we've set.
389     *
390     * @param list the list to add to.
391     * @param tok  the token to add.
392     */
393    private void addToken(final List<String> list, String tok) {
394        if (tok == null || tok.isEmpty()) {
395            if (isIgnoreEmptyTokens()) {
396                return;
397            }
398            if (isEmptyTokenAsNull()) {
399                tok = null;
400            }
401        }
402        list.add(tok);
403    }
404
405    /**
406     * Checks if tokenization has been done, and if not then do it.
407     */
408    private void checkTokenized() {
409        if (tokens == null) {
410            final List<String> split;
411            if (chars == null) {
412                // still call tokenize as subclass may do some work.
413                split = tokenize(null, 0, 0);
414            } else {
415                split = tokenize(chars, 0, chars.length);
416            }
417            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
418        }
419    }
420
421    /**
422     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
423     * {@link CloneNotSupportedException} is caught, return {@code null}.
424     *
425     * @return a new instance of this Tokenizer which has been reset.
426     */
427    @Override
428    public Object clone() {
429        try {
430            return cloneReset();
431        } catch (final CloneNotSupportedException ex) {
432            return null;
433        }
434    }
435
436    /**
437     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
438     *
439     * @return a new instance of this Tokenizer which has been reset.
440     * @throws CloneNotSupportedException if there is a problem cloning.
441     */
442    Object cloneReset() throws CloneNotSupportedException {
443        // this method exists to enable 100% test coverage
444        final StringTokenizer cloned = (StringTokenizer) super.clone();
445        if (cloned.chars != null) {
446            cloned.chars = cloned.chars.clone();
447        }
448        cloned.reset();
449        return cloned;
450    }
451
452    /**
453     * Gets the String content that the tokenizer is parsing.
454     *
455     * @return The string content being parsed.
456     */
457    public String getContent() {
458        if (chars == null) {
459            return null;
460        }
461        return new String(chars);
462    }
463
464    /**
465     * Gets the field delimiter matcher.
466     *
467     * @return The delimiter matcher in use.
468     */
469    public StringMatcher getDelimiterMatcher() {
470        return this.delimMatcher;
471    }
472
473    /**
474     * Gets the ignored character matcher.
475     * <p>
476     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
477     * </p>
478     *
479     * @return The ignored matcher in use.
480     */
481    public StringMatcher getIgnoredMatcher() {
482        return ignoredMatcher;
483    }
484
485    /**
486     * Gets the quote matcher currently in use.
487     * <p>
488     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
489     * </p>
490     *
491     * @return The quote matcher in use.
492     */
493    public StringMatcher getQuoteMatcher() {
494        return quoteMatcher;
495    }
496
497    /**
498     * Gets a copy of the full token list as an independent modifiable array.
499     *
500     * @return The tokens as a String array.
501     */
502    public String[] getTokenArray() {
503        checkTokenized();
504        return tokens.clone();
505    }
506
507    /**
508     * Gets a copy of the full token list as an independent modifiable list.
509     *
510     * @return The tokens as a String list.
511     */
512    public List<String> getTokenList() {
513        checkTokenized();
514        return new ArrayList<>(Arrays.asList(tokens));
515    }
516
517    /**
518     * Gets the trimmer character matcher.
519     * <p>
520     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
521     * </p>
522     *
523     * @return The trimmer matcher in use.
524     */
525    public StringMatcher getTrimmerMatcher() {
526        return trimmerMatcher;
527    }
528
529    /**
530     * Tests whether there are any more tokens.
531     *
532     * @return true if there are more tokens.
533     */
534    @Override
535    public boolean hasNext() {
536        checkTokenized();
537        return tokenPos < tokens.length;
538    }
539
540    /**
541     * Tests whether there are any previous tokens that can be iterated to.
542     *
543     * @return true if there are previous tokens.
544     */
545    @Override
546    public boolean hasPrevious() {
547        checkTokenized();
548        return tokenPos > 0;
549    }
550
551    /**
552     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
553     *
554     * @return true if empty tokens are returned as null.
555     */
556    public boolean isEmptyTokenAsNull() {
557        return this.emptyAsNull;
558    }
559
560    /**
561     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
562     *
563     * @return true if empty tokens are not returned.
564     */
565    public boolean isIgnoreEmptyTokens() {
566        return ignoreEmptyTokens;
567    }
568
569    /**
570     * Tests if the characters at the index specified match the quote already matched in readNextToken().
571     *
572     * @param srcChars   the character array being tokenized.
573     * @param pos        the position to check for a quote.
574     * @param len        the length of the character array being tokenized.
575     * @param quoteStart the start position of the matched quote, 0 if no quoting.
576     * @param quoteLen   the length of the matched quote, 0 if no quoting.
577     * @return true if a quote is matched.
578     */
579    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
580        for (int i = 0; i < quoteLen; i++) {
581            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
582                return false;
583            }
584        }
585        return true;
586    }
587
588    /**
589     * Gets the next token.
590     *
591     * @return The next String token.
592     * @throws NoSuchElementException if there are no more elements.
593     */
594    @Override
595    public String next() {
596        if (hasNext()) {
597            return tokens[tokenPos++];
598        }
599        throw new NoSuchElementException();
600    }
601
602    /**
603     * Gets the index of the next token to return.
604     *
605     * @return The next token index.
606     */
607    @Override
608    public int nextIndex() {
609        return tokenPos;
610    }
611
612    /**
613     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
614     * tokens remain.
615     *
616     * @return The next sequential token, or null when no more tokens are found.
617     */
618    public String nextToken() {
619        if (hasNext()) {
620            return tokens[tokenPos++];
621        }
622        return null;
623    }
624
625    /**
626     * Gets the token previous to the last returned token.
627     *
628     * @return The previous token.
629     */
630    @Override
631    public String previous() {
632        if (hasPrevious()) {
633            return tokens[--tokenPos];
634        }
635        throw new NoSuchElementException();
636    }
637
638    /**
639     * Gets the index of the previous token.
640     *
641     * @return The previous token index.
642     */
643    @Override
644    public int previousIndex() {
645        return tokenPos - 1;
646    }
647
648    /**
649     * Gets the previous token from the String.
650     *
651     * @return The previous sequential token, or null when no more tokens are found.
652     */
653    public String previousToken() {
654        if (hasPrevious()) {
655            return tokens[--tokenPos];
656        }
657        return null;
658    }
659
660    /**
661     * Reads character by character through the String to get the next token.
662     *
663     * @param srcChars  the character array being tokenized.
664     * @param start     the first character of field.
665     * @param len       the length of the character array being tokenized.
666     * @param workArea  a temporary work area.
667     * @param tokenList the list of parsed tokens.
668     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
669     */
670    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
671            final List<String> tokenList) {
672        // skip all leading whitespace, unless it is the
673        // field delimiter or the quote character
674        while (start < len) {
675            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
676                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
677            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
678                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
679                break;
680            }
681            start += removeLen;
682        }
683
684        // handle reaching end
685        if (start >= len) {
686            addToken(tokenList, StringUtils.EMPTY);
687            return -1;
688        }
689
690        // handle empty token
691        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
692        if (delimLen > 0) {
693            addToken(tokenList, StringUtils.EMPTY);
694            return start + delimLen;
695        }
696
697        // handle found token
698        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
699        if (quoteLen > 0) {
700            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
701        }
702        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
703    }
704
705    /**
706     * Reads a possibly quoted string token.
707     *
708     * @param srcChars   the character array being tokenized.
709     * @param start      the first character of field.
710     * @param len        the length of the character array being tokenized.
711     * @param workArea   a temporary work area.
712     * @param tokenList  the list of parsed tokens.
713     * @param quoteStart the start position of the matched quote, 0 if no quoting.
714     * @param quoteLen   the length of the matched quote, 0 if no quoting.
715     * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
716     */
717    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
718            final List<String> tokenList, final int quoteStart, final int quoteLen) {
719        // Loop until we've found the end of the quoted
720        // string or the end of the input
721        workArea.clear();
722        int pos = start;
723        boolean quoting = quoteLen > 0;
724        int trimStart = 0;
725
726        while (pos < len) {
727            // quoting mode can occur several times throughout a string
728            // we must switch between quoting and non-quoting until we
729            // encounter a non-quoted delimiter, or end of string
730            if (quoting) {
731                // In quoting mode
732
733                // If we've found a quote character, see if it's
734                // followed by a second quote. If so, then we need
735                // to actually put the quote character into the token
736                // rather than end the token.
737                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
738                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
739                        // matched pair of quotes, thus an escaped quote
740                        workArea.append(srcChars, pos, quoteLen);
741                        pos += quoteLen * 2;
742                        trimStart = workArea.size();
743                        continue;
744                    }
745
746                    // end of quoting
747                    quoting = false;
748                    pos += quoteLen;
749                    continue;
750                }
751
752            } else {
753                // Not in quoting mode
754
755                // check for delimiter, and thus end of token
756                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
757                if (delimLen > 0) {
758                    // return condition when end of token found
759                    addToken(tokenList, workArea.substring(0, trimStart));
760                    return pos + delimLen;
761                }
762
763                // check for quote, and thus back into quoting mode
764                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
765                    quoting = true;
766                    pos += quoteLen;
767                    continue;
768                }
769
770                // check for ignored (outside quotes), and ignore
771                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
772                if (ignoredLen > 0) {
773                    pos += ignoredLen;
774                    continue;
775                }
776
777                // check for trimmed character
778                // don't yet know if its at the end, so copy to workArea
779                // use trimStart to keep track of trim at the end
780                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
781                if (trimmedLen > 0) {
782                    workArea.append(srcChars, pos, trimmedLen);
783                    pos += trimmedLen;
784                    continue;
785                }
786            }
787            // copy regular character from inside quotes
788            workArea.append(srcChars[pos++]);
789            trimStart = workArea.size();
790        }
791
792        // return condition when end of string found
793        addToken(tokenList, workArea.substring(0, trimStart));
794        return -1;
795    }
796
797    /**
798     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
799     *
800     * @throws UnsupportedOperationException Always thrown.
801     */
802    @Override
803    public void remove() {
804        throw new UnsupportedOperationException("remove() is unsupported");
805    }
806
807    /**
808     * Resets this tokenizer, forgetting all parsing and iteration already completed.
809     * <p>
810     * This method allows the same tokenizer to be reused for the same String.
811     * </p>
812     *
813     * @return {@code this} instance.
814     */
815    public StringTokenizer reset() {
816        tokenPos = 0;
817        tokens = null;
818        return this;
819    }
820
821    /**
822     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
823     *
824     * @param input the new character array to tokenize, not cloned, null sets no text to parse.
825     * @return {@code this} instance.
826     */
827    public StringTokenizer reset(final char[] input) {
828        reset();
829        this.chars = input != null ? input.clone() : null;
830        return this;
831    }
832
833    /**
834     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
835     *
836     * @param input the new string to tokenize, null sets no text to parse.
837     * @return {@code this} instance.
838     */
839    public StringTokenizer reset(final String input) {
840        reset();
841        this.chars = input != null ? input.toCharArray() : null;
842        return this;
843    }
844
845    /**
846     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
847     *
848     * @param obj this parameter ignored.
849     * @throws UnsupportedOperationException always.
850     */
851    @Override
852    public void set(final String obj) {
853        throw new UnsupportedOperationException("set() is unsupported");
854    }
855
856    /**
857     * Sets the field delimiter character.
858     *
859     * @param delim the delimiter character to use.
860     * @return {@code this} instance.
861     */
862    public StringTokenizer setDelimiterChar(final char delim) {
863        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
864    }
865
866    /**
867     * Sets the field delimiter matcher.
868     * <p>
869     * The delimiter is used to separate one token from another.
870     * </p>
871     *
872     * @param delim the delimiter matcher to use.
873     * @return {@code this} instance.
874     */
875    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
876        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
877        return this;
878    }
879
880    /**
881     * Sets the field delimiter string.
882     *
883     * @param delim the delimiter string to use.
884     * @return {@code this} instance.
885     */
886    public StringTokenizer setDelimiterString(final String delim) {
887        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
888    }
889
890    /**
891     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
892     *
893     * @param emptyAsNull whether empty tokens are returned as null.
894     * @return {@code this} instance.
895     */
896    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
897        this.emptyAsNull = emptyAsNull;
898        return this;
899    }
900
901    /**
902     * Sets the character to ignore.
903     * <p>
904     * This character is ignored when parsing the String, unless it is within a quoted region.
905     * </p>
906     *
907     * @param ignored the ignored character to use.
908     * @return {@code this} instance.
909     */
910    public StringTokenizer setIgnoredChar(final char ignored) {
911        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
912    }
913
914    /**
915     * Sets the matcher for characters to ignore.
916     * <p>
917     * These characters are ignored when parsing the String, unless they are within a quoted region.
918     * </p>
919     *
920     * @param ignored the ignored matcher to use, null ignored.
921     * @return {@code this} instance.
922     */
923    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
924        if (ignored != null) {
925            this.ignoredMatcher = ignored;
926        }
927        return this;
928    }
929
930    /**
931     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
932     *
933     * @param ignoreEmptyTokens whether empty tokens are not returned.
934     * @return {@code this} instance.
935     */
936    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
937        this.ignoreEmptyTokens = ignoreEmptyTokens;
938        return this;
939    }
940
941    /**
942     * Sets the quote character to use.
943     * <p>
944     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
945     * </p>
946     *
947     * @param quote the quote character to use.
948     * @return {@code this} instance.
949     */
950    public StringTokenizer setQuoteChar(final char quote) {
951        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
952    }
953
954    /**
955     * Sets the quote matcher to use.
956     * <p>
957     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
958     * </p>
959     *
960     * @param quote the quote matcher to use, null ignored.
961     * @return {@code this} instance.
962     */
963    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
964        if (quote != null) {
965            this.quoteMatcher = quote;
966        }
967        return this;
968    }
969
970    /**
971     * Sets the matcher for characters to trim.
972     * <p>
973     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
974     *
975     * @param trimmer the trimmer matcher to use, null ignored.
976     * @return {@code this} instance.
977     */
978    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
979        if (trimmer != null) {
980            this.trimmerMatcher = trimmer;
981        }
982        return this;
983    }
984
985    /**
986     * Gets the number of tokens found in the String.
987     *
988     * @return The number of matched tokens.
989     */
990    public int size() {
991        checkTokenized();
992        return tokens.length;
993    }
994
995    /**
996     * Internal method to performs the tokenization.
997     * <p>
998     * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
999     * </p>
1000     * <p>
1001     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1002     * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1003     * </p>
1004     * <p>
1005     * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1006     * values, or even an entirely different array.
1007     * </p>
1008     *
1009     * @param srcChars the character array being tokenized, may be null.
1010     * @param offset   the start position within the character array, must be valid.
1011     * @param count    the number of characters to tokenize, must be valid.
1012     * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1013     */
1014    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1015        if (srcChars == null || count == 0) {
1016            return Collections.emptyList();
1017        }
1018        final TextStringBuilder buf = new TextStringBuilder();
1019        final List<String> tokenList = new ArrayList<>();
1020        int pos = offset;
1021        // loop around the entire buffer
1022        while (pos >= 0 && pos < count) {
1023            // find next token
1024            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1025            // handle case where end of string is a delimiter
1026            if (pos >= count) {
1027                addToken(tokenList, StringUtils.EMPTY);
1028            }
1029        }
1030        return tokenList;
1031    }
1032
1033    /**
1034     * Gets the String content that the tokenizer is parsing.
1035     *
1036     * @return The string content being parsed.
1037     */
1038    @Override
1039    public String toString() {
1040        if (tokens == null) {
1041            return "StringTokenizer[not tokenized yet]";
1042        }
1043        return "StringTokenizer" + getTokenList();
1044    }
1045}