001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025import java.util.StringTokenizer;
026
027import org.apache.commons.lang3.ArrayUtils;
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Tokenizes a string based on delimiters (separators)
032 * and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims
035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
036 * however it offers much more control and flexibility including implementing
037 * the {@link ListIterator} interface. By default, it is set up
038 * like {@link StringTokenizer}.
039 * </p>
040 * <p>
041 * The input String is split into a number of <i>tokens</i>.
042 * Each token is separated from the next String by a <i>delimiter</i>.
043 * One or more delimiter characters must be specified.
044 * </p>
045 * <p>
046 * Each token may be surrounded by quotes.
047 * The <i>quote</i> matcher specifies the quote character(s).
048 * A quote may be escaped within a quoted section by duplicating itself.
049 * </p>
050 * <p>
051 * Between each token and the delimiter are potentially characters that need trimming.
052 * The <i>trimmer</i> matcher specifies these characters.
053 * One usage might be to trim whitespace characters.
054 * </p>
055 * <p>
056 * At any point outside the quotes there might potentially be invalid characters.
057 * The <i>ignored</i> matcher specifies these characters to be removed.
058 * One usage might be to remove new line characters.
059 * </p>
060 * <p>
061 * Empty tokens may be removed or returned as null.
062 * </p>
063 * <pre>
064 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
065 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
067 * </pre>
068 *
069 * <table>
070 *  <caption>StrTokenizer properties and options</caption>
071 *  <tr>
072 *   <th>Property</th><th>Type</th><th>Default</th>
073 *  </tr>
074 *  <tr>
075 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
076 *  </tr>
077 *  <tr>
078 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
079 *  </tr>
080 *  <tr>
081 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
082 *  </tr>
083 *  <tr>
084 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
085 *  </tr>
086 *  <tr>
087 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
088 *  </tr>
089 * </table>
090 *
091 * @since 2.2
092 * @deprecated As of 3.6, use Apache Commons Text
093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
094 * StringTokenizer</a> instead
095 */
096@Deprecated
097public class StrTokenizer implements ListIterator<String>, Cloneable {
098
    /** Prototype tokenizer configured for comma separated values; copied by {@code getCSVClone()}. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Prototype tokenizer configured for tab separated values; copied by {@code getTSVClone()}. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        // CSV prototype: comma delimiter, double-quote quoting, nothing ignored,
        // trimming via the trim matcher; empty tokens are kept and are not turned into null.
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        // TSV prototype: identical settings except that the delimiter is the tab matcher.
        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }
118
119    /**
120     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121     *
122     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123     */
124    private static StrTokenizer getCSVClone() {
125        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126    }
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * without any input set yet.  The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the {@code setTrimmerMatcher} method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }
140    /**
141     * Gets a new tokenizer instance which parses Comma Separated Value strings
142     * initializing it with the given input.  The default for CSV processing
143     * will be trim whitespace from both ends (which can be overridden with
144     * the setTrimmer method).
145     *
146     * @param input  the text to parse
147     * @return a new tokenizer instance which parses Comma Separated Value strings
148     */
149    public static StrTokenizer getCSVInstance(final char[] input) {
150        final StrTokenizer tok = getCSVClone();
151        tok.reset(input);
152        return tok;
153    }
154
155    /**
156     * Gets a new tokenizer instance which parses Comma Separated Value strings
157     * initializing it with the given input.  The default for CSV processing
158     * will be trim whitespace from both ends (which can be overridden with
159     * the setTrimmer method).
160     *
161     * @param input  the text to parse
162     * @return a new tokenizer instance which parses Comma Separated Value strings
163     */
164    public static StrTokenizer getCSVInstance(final String input) {
165        final StrTokenizer tok = getCSVClone();
166        tok.reset(input);
167        return tok;
168    }
169    /**
170     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171     *
172     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173     */
174    private static StrTokenizer getTSVClone() {
175        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176    }
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * without any input set yet.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the {@code setTrimmerMatcher} method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }
189    /**
190     * Gets a new tokenizer instance which parses Tab Separated Value strings.
191     * The default for CSV processing will be trim whitespace from both ends
192     * (which can be overridden with the setTrimmer method).
193     * @param input  the string to parse
194     * @return a new tokenizer instance which parses Tab Separated Value strings.
195     */
196    public static StrTokenizer getTSVInstance(final char[] input) {
197        final StrTokenizer tok = getTSVClone();
198        tok.reset(input);
199        return tok;
200    }
201
202    /**
203     * Gets a new tokenizer instance which parses Tab Separated Value strings.
204     * The default for CSV processing will be trim whitespace from both ends
205     * (which can be overridden with the setTrimmer method).
206     * @param input  the string to parse
207     * @return a new tokenizer instance which parses Tab Separated Value strings.
208     */
209    public static StrTokenizer getTSVInstance(final String input) {
210        final StrTokenizer tok = getTSVClone();
211        tok.reset(input);
212        return tok;
213    }
    /** The text to work on; {@code null} when no text has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed (see {@code checkTokenized()}). */
    private String[] tokens;

    /** The current iteration position, i.e. the index into {@code tokens} of the next token to return. */
    private int tokenPos;

    /** The delimiter matcher; initialized to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; initialized to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; initialized to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; initialized to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; {@code false} unless set. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; {@code true} unless set. */
    private boolean ignoreEmptyTokens = true;
240
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}
     * or {@link #reset(char[])} to supply the text afterwards.
     * </p>
     */
    public StrTokenizer() {
        // no text yet: tokenizing in this state passes null to tokenize (see checkTokenized)
        this.chars = null;
    }
251
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     * <p>
     * The array is passed through {@code ArrayUtils.clone}, so the tokenizer
     * keeps the cloned array rather than the caller's array.
     * </p>
     *
     * @param input  the characters which are to be parsed; cloned before being stored
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }
261
    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the characters which are to be parsed; cloned by the delegated constructor
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
272
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the characters which are to be parsed; cloned by the delegated constructor
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
285
    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the characters which are to be parsed; cloned by the delegated constructor
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
296
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the characters which are to be parsed; cloned by the delegated constructor
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
307
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the characters which are to be parsed; cloned by the delegated constructor
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
320
321    /**
322     * Constructs a tokenizer splitting on space, tab, newline and formfeed
323     * as per StringTokenizer.
324     *
325     * @param input  the string which is to be parsed
326     */
327    public StrTokenizer(final String input) {
328        if (input != null) {
329            chars = input.toCharArray();
330        } else {
331            chars = null;
332        }
333    }
334
    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed, may be {@code null} for no text
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
345
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed, may be {@code null} for no text
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
358
    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed, may be {@code null} for no text
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
369
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed, may be {@code null} for no text
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
380
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed, may be {@code null} for no text
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
393
    /**
     * Unsupported ListIterator operation; tokens cannot be added through the iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
403
404    /**
405     * Adds a token to a list, paying attention to the parameters we've set.
406     *
407     * @param list  the list to add to
408     * @param tok  the token to add
409     */
410    private void addToken(final List<String> list, String tok) {
411        if (StringUtils.isEmpty(tok)) {
412            if (isIgnoreEmptyTokens()) {
413                return;
414            }
415            if (isEmptyTokenAsNull()) {
416                tok = null;
417            }
418        }
419        list.add(tok);
420    }
421
422    /**
423     * Checks if tokenization has been done, and if not then do it.
424     */
425    private void checkTokenized() {
426        if (tokens == null) {
427            if (chars == null) {
428                // still call tokenize as subclass may do some work
429                final List<String> split = tokenize(null, 0, 0);
430                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
431            } else {
432                final List<String> split = tokenize(chars, 0, chars.length);
433                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
434            }
435        }
436    }
437
438    /**
439     * Creates a new instance of this Tokenizer. The new instance is reset so
440     * that it will be at the start of the token list.
441     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
442     *
443     * @return a new instance of this Tokenizer which has been reset.
444     */
445    @Override
446    public Object clone() {
447        try {
448            return cloneReset();
449        } catch (final CloneNotSupportedException ex) {
450            return null;
451        }
452    }
453
454    /**
455     * Creates a new instance of this Tokenizer. The new instance is reset so that
456     * it will be at the start of the token list.
457     *
458     * @return a new instance of this Tokenizer which has been reset.
459     * @throws CloneNotSupportedException if there is a problem cloning
460     */
461    Object cloneReset() throws CloneNotSupportedException {
462        // this method exists to enable 100% test coverage
463        final StrTokenizer cloned = (StrTokenizer) super.clone();
464        if (cloned.chars != null) {
465            cloned.chars = cloned.chars.clone();
466        }
467        cloned.reset();
468        return cloned;
469    }
470
471    /**
472     * Gets the String content that the tokenizer is parsing.
473     *
474     * @return the string content being parsed
475     */
476    public String getContent() {
477        if (chars == null) {
478            return null;
479        }
480        return new String(chars);
481    }
482
483    /**
484     * Gets the field delimiter matcher.
485     *
486     * @return the delimiter matcher in use
487     */
488    public StrMatcher getDelimiterMatcher() {
489        return this.delimMatcher;
490    }
491
492    // Ignored
493    /**
494     * Gets the ignored character matcher.
495     * <p>
496     * These characters are ignored when parsing the String, unless they are
497     * within a quoted region.
498     * The default value is not to ignore anything.
499     * </p>
500     *
501     * @return the ignored matcher in use
502     */
503    public StrMatcher getIgnoredMatcher() {
504        return ignoredMatcher;
505    }
506
507    /**
508     * Gets the quote matcher currently in use.
509     * <p>
510     * The quote character is used to wrap data between the tokens.
511     * This enables delimiters to be entered as data.
512     * The default value is '"' (double quote).
513     * </p>
514     *
515     * @return the quote matcher in use
516     */
517    public StrMatcher getQuoteMatcher() {
518        return quoteMatcher;
519    }
520
521    /**
522     * Gets a copy of the full token list as an independent modifiable array.
523     *
524     * @return the tokens as a String array
525     */
526    public String[] getTokenArray() {
527        checkTokenized();
528        return tokens.clone();
529    }
530
531    /**
532     * Gets a copy of the full token list as an independent modifiable list.
533     *
534     * @return the tokens as a String array
535     */
536    public List<String> getTokenList() {
537        checkTokenized();
538        final List<String> list = new ArrayList<>(tokens.length);
539        list.addAll(Arrays.asList(tokens));
540        return list;
541    }
542
543    /**
544     * Gets the trimmer character matcher.
545     * <p>
546     * These characters are trimmed off on each side of the delimiter
547     * until the token or quote is found.
548     * The default value is not to trim anything.
549     * </p>
550     *
551     * @return the trimmer matcher in use
552     */
553    public StrMatcher getTrimmerMatcher() {
554        return trimmerMatcher;
555    }
556
557    /**
558     * Checks whether there are any more tokens.
559     *
560     * @return true if there are more tokens
561     */
562    @Override
563    public boolean hasNext() {
564        checkTokenized();
565        return tokenPos < tokens.length;
566    }
567
568    /**
569     * Checks whether there are any previous tokens that can be iterated to.
570     *
571     * @return true if there are previous tokens
572     */
573    @Override
574    public boolean hasPrevious() {
575        checkTokenized();
576        return tokenPos > 0;
577    }
578
579    /**
580     * Gets whether the tokenizer currently returns empty tokens as null.
581     * The default for this property is false.
582     *
583     * @return true if empty tokens are returned as null
584     */
585    public boolean isEmptyTokenAsNull() {
586        return this.emptyAsNull;
587    }
588
589    /**
590     * Gets whether the tokenizer currently ignores empty tokens.
591     * The default for this property is true.
592     *
593     * @return true if empty tokens are not returned
594     */
595    public boolean isIgnoreEmptyTokens() {
596        return ignoreEmptyTokens;
597    }
598
599    /**
600     * Checks if the characters at the index specified match the quote
601     * already matched in readNextToken().
602     *
603     * @param srcChars  the character array being tokenized
604     * @param pos  the position to check for a quote
605     * @param len  the length of the character array being tokenized
606     * @param quoteStart  the start position of the matched quote, 0 if no quoting
607     * @param quoteLen  the length of the matched quote, 0 if no quoting
608     * @return true if a quote is matched
609     */
610    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
611        for (int i = 0; i < quoteLen; i++) {
612            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
613                return false;
614            }
615        }
616        return true;
617    }
618
619    /**
620     * Gets the next token.
621     *
622     * @return the next String token
623     * @throws NoSuchElementException if there are no more elements
624     */
625    @Override
626    public String next() {
627        if (hasNext()) {
628            return tokens[tokenPos++];
629        }
630        throw new NoSuchElementException();
631    }
632
633    /**
634     * Gets the index of the next token to return.
635     *
636     * @return the next token index
637     */
638    @Override
639    public int nextIndex() {
640        return tokenPos;
641    }
642
643    /**
644     * Gets the next token from the String.
645     * Equivalent to {@link #next()} except it returns null rather than
646     * throwing {@link NoSuchElementException} when no tokens remain.
647     *
648     * @return the next sequential token, or null when no more tokens are found
649     */
650    public String nextToken() {
651        if (hasNext()) {
652            return tokens[tokenPos++];
653        }
654        return null;
655    }
656
657    /**
658     * Gets the token previous to the last returned token.
659     *
660     * @return the previous token
661     */
662    @Override
663    public String previous() {
664        if (hasPrevious()) {
665            return tokens[--tokenPos];
666        }
667        throw new NoSuchElementException();
668    }
669
670    /**
671     * Gets the index of the previous token.
672     *
673     * @return the previous token index
674     */
675    @Override
676    public int previousIndex() {
677        return tokenPos - 1;
678    }
679
680    /**
681     * Gets the previous token from the String.
682     *
683     * @return the previous sequential token, or null when no more tokens are found
684     */
685    public String previousToken() {
686        if (hasPrevious()) {
687            return tokens[--tokenPos];
688        }
689        return null;
690    }
691
    /**
     * Reads character by character through the String to get the next token.
     * <p>
     * Leading ignored/trimmed characters are skipped first. Reaching the end of
     * the text or hitting a delimiter straight away yields an empty token
     * (subject to the rules in {@code addToken}); otherwise the token body is
     * read by {@code readWithQuotes}, in quoted mode if the token starts with a quote.
     * </p>
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            // the longer of the ignored/trimmer matches decides how far to skip
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            // token opens with a quote: read from just after it, remembering where the quote sits
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        // unquoted token: quoteStart/quoteLen of 0 disable quote handling
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
738
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Characters are accumulated in {@code workArea}; {@code trimStart} records the
     * length of the accumulated text up to the last character that must be kept, so
     * trimmable characters collected at the end of the token are cut off when the
     * token is added.
     * </p>
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // a non-zero quote length means the caller already consumed an opening quote
        boolean quoting = quoteLen > 0;
        // length of the workArea content that is kept when the token is emitted
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character: reached for any non-quote character inside quotes,
            // and outside quotes for a character that no matcher above claimed
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
832
    /**
     * Unsupported ListIterator operation; tokens cannot be removed through the iterator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
842
843    /**
844     * Resets this tokenizer, forgetting all parsing and iteration already completed.
845     * <p>
846     * This method allows the same tokenizer to be reused for the same String.
847     * </p>
848     *
849     * @return this, to enable chaining
850     */
851    public StrTokenizer reset() {
852        tokenPos = 0;
853        tokens = null;
854        return this;
855    }
856
    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     * <p>
     * The array is passed through {@code ArrayUtils.clone}, so the tokenizer
     * keeps the cloned array rather than the caller's array.
     * </p>
     *
     * @param input  the new character array to tokenize, cloned before being stored,
     *  null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }
870
871    /**
872     * Reset this tokenizer, giving it a new input string to parse.
873     * In this manner you can re-use a tokenizer with the same settings
874     * on multiple input lines.
875     *
876     * @param input  the new string to tokenize, null sets no text to parse
877     * @return this, to enable chaining
878     */
879    public StrTokenizer reset(final String input) {
880        reset();
881        if (input != null) {
882            this.chars = input.toCharArray();
883        } else {
884            this.chars = null;
885        }
886        return this;
887    }
888
889    /**
890     * Unsupported ListIterator operation.
891     * @param obj this parameter ignored.
892     * @throws UnsupportedOperationException always
893     */
894    @Override
895    public void set(final String obj) {
896        throw new UnsupportedOperationException("set() is unsupported");
897    }
898
899    /**
900     * Sets the field delimiter character.
901     *
902     * @param delim  the delimiter character to use
903     * @return this, to enable chaining
904     */
905    public StrTokenizer setDelimiterChar(final char delim) {
906        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
907    }
908
909    /**
910     * Sets the field delimiter matcher.
911     * <p>
912     * The delimiter is used to separate one token from another.
913     * </p>
914     *
915     * @param delim  the delimiter matcher to use
916     * @return this, to enable chaining
917     */
918    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
919        if (delim == null) {
920            this.delimMatcher = StrMatcher.noneMatcher();
921        } else {
922            this.delimMatcher = delim;
923        }
924        return this;
925    }
926
927    /**
928     * Sets the field delimiter string.
929     *
930     * @param delim  the delimiter string to use
931     * @return this, to enable chaining
932     */
933    public StrTokenizer setDelimiterString(final String delim) {
934        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
935    }
936
937    /**
938     * Sets whether the tokenizer should return empty tokens as null.
939     * The default for this property is false.
940     *
941     * @param emptyAsNull  whether empty tokens are returned as null
942     * @return this, to enable chaining
943     */
944    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
945        this.emptyAsNull = emptyAsNull;
946        return this;
947    }
948
949    /**
950     * Sets the character to ignore.
951     * <p>
952     * This character is ignored when parsing the String, unless it is
953     * within a quoted region.
954     *
955     * @param ignored  the ignored character to use
956     * @return this, to enable chaining
957     */
958    public StrTokenizer setIgnoredChar(final char ignored) {
959        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
960    }
961
962    /**
963     * Sets the matcher for characters to ignore.
964     * <p>
965     * These characters are ignored when parsing the String, unless they are
966     * within a quoted region.
967     * </p>
968     *
969     * @param ignored  the ignored matcher to use, null ignored
970     * @return this, to enable chaining
971     */
972    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
973        if (ignored != null) {
974            this.ignoredMatcher = ignored;
975        }
976        return this;
977    }
978
979    /**
980     * Sets whether the tokenizer should ignore and not return empty tokens.
981     * The default for this property is true.
982     *
983     * @param ignoreEmptyTokens  whether empty tokens are not returned
984     * @return this, to enable chaining
985     */
986    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
987        this.ignoreEmptyTokens = ignoreEmptyTokens;
988        return this;
989    }
990
991    /**
992     * Sets the quote character to use.
993     * <p>
994     * The quote character is used to wrap data between the tokens.
995     * This enables delimiters to be entered as data.
996     * </p>
997     *
998     * @param quote  the quote character to use
999     * @return this, to enable chaining
1000     */
1001    public StrTokenizer setQuoteChar(final char quote) {
1002        return setQuoteMatcher(StrMatcher.charMatcher(quote));
1003    }
1004
1005    /**
1006     * Sets the quote matcher to use.
1007     * <p>
1008     * The quote character is used to wrap data between the tokens.
1009     * This enables delimiters to be entered as data.
1010     * </p>
1011     *
1012     * @param quote  the quote matcher to use, null ignored
1013     * @return this, to enable chaining
1014     */
1015    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1016        if (quote != null) {
1017            this.quoteMatcher = quote;
1018        }
1019        return this;
1020    }
1021
1022    /**
1023     * Sets the matcher for characters to trim.
1024     * <p>
1025     * These characters are trimmed off on each side of the delimiter
1026     * until the token or quote is found.
1027     * </p>
1028     *
1029     * @param trimmer  the trimmer matcher to use, null ignored
1030     * @return this, to enable chaining
1031     */
1032    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1033        if (trimmer != null) {
1034            this.trimmerMatcher = trimmer;
1035        }
1036        return this;
1037    }
1038
1039    // API
1040    /**
1041     * Gets the number of tokens found in the String.
1042     *
1043     * @return the number of matched tokens
1044     */
1045    public int size() {
1046        checkTokenized();
1047        return tokens.length;
1048    }
1049
1050    /**
1051     * Internal method to performs the tokenization.
1052     * <p>
1053     * Most users of this class do not need to call this method. This method
1054     * will be called automatically by other (public) methods when required.
1055     * </p>
1056     * <p>
1057     * This method exists to allow subclasses to add code before or after the
1058     * tokenization. For example, a subclass could alter the character array,
1059     * offset or count to be parsed, or call the tokenizer multiple times on
1060     * multiple strings. It is also be possible to filter the results.
1061     * </p>
1062     * <p>
1063     * {@link StrTokenizer} will always pass a zero offset and a count
1064     * equal to the length of the array to this method, however a subclass
1065     * may pass other values, or even an entirely different array.
1066     * </p>
1067     *
1068     * @param srcChars  the character array being tokenized, may be null
1069     * @param offset  the start position within the character array, must be valid
1070     * @param count  the number of characters to tokenize, must be valid
1071     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
1072     */
1073    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1074        if (ArrayUtils.isEmpty(srcChars)) {
1075            return Collections.emptyList();
1076        }
1077        final StrBuilder buf = new StrBuilder();
1078        final List<String> tokenList = new ArrayList<>();
1079        int pos = offset;
1080
1081        // loop around the entire buffer
1082        while (pos >= 0 && pos < count) {
1083            // find next token
1084            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1085
1086            // handle case where end of string is a delimiter
1087            if (pos >= count) {
1088                addToken(tokenList, StringUtils.EMPTY);
1089            }
1090        }
1091        return tokenList;
1092    }
1093
1094    /**
1095     * Gets the String content that the tokenizer is parsing.
1096     *
1097     * @return the string content being parsed
1098     */
1099    @Override
1100    public String toString() {
1101        if (tokens == null) {
1102            return "StrTokenizer[not tokenized yet]";
1103        }
1104        return "StrTokenizer" + getTokenList();
1105    }
1106
1107}