/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including implementing
 * the {@code ListIterator} interface. By default, it is set up
 * like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <em>tokens</em>.
 * Each token is separated from the next by a <em>delimiter</em>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <em>quote</em> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <em>trimmer</em> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there may be unwanted characters.
 * The <em>ignored</em> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
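 * <p>
 * A minimal usage sketch (illustrative only; the input line is a made-up example):
 * </p>
 * <pre>{@code
 * StrTokenizer tokenizer = StrTokenizer.getCSVInstance("a, b, \"c, d\"");
 * while (tokenizer.hasNext()) {
 *     String token = tokenizer.next(); // "a", then "b", then "c, d"
 * }
 * }</pre>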
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 1.0
 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    // @formatter:off
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on

    /** Tab separated values tokenizer internal variable. */
    // @formatter:off
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
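     * <p>
     * For example (illustrative sketch):
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = StrTokenizer.getCSVInstance();
     * tokenizer.reset("a, b, c");
     * String[] fields = tokenizer.getTokenArray(); // ["a", "b", "c"]
     * }</pre>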
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /** The text to work on. */
    private char[] chars;

    /** The parsed tokens. */
    private String[] tokens;

    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StrTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.isEmpty()) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Checks if tokenization has been done, and if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            }
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, {@code null} is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character.
     * </p>
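     * <p>
     * For example (illustrative sketch), a quoted token may contain the delimiter:
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = new StrTokenizer("a,\"b,c\",d", ',', '"');
     * String[] fields = tokenizer.getTokenArray(); // ["a", "b,c", "d"]
     * }</pre>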
     *
     * @return The quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);
        return list;
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars,
                            final int pos,
                            final int len,
                            final int quoteStart,
                            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return The previous sequential token, or null when there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

            }
            // copy regular character from inside quotes
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
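     * <p>
     * For example (illustrative sketch; {@code lines} is an assumed collection of input lines):
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tokenizer.reset(line);
     *     String[] fields = tokenizer.getTokenArray();
     * }
     * }</pre>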
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter string.
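     * <p>
     * For example (illustrative sketch), a multi-character delimiter:
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = new StrTokenizer("a||b||c", "||");
     * String[] fields = tokenizer.getTokenArray(); // ["a", "b", "c"]
     * }</pre>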
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
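     * <p>
     * For example (illustrative sketch):
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = new StrTokenizer("a,,c", ',');
     * tokenizer.setIgnoreEmptyTokens(false);
     * tokenizer.setEmptyTokenAsNull(true);
     * String[] fields = tokenizer.getTokenArray(); // ["a", null, "c"]
     * }</pre>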
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
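     * <p>
     * A quote may be escaped within a quoted section by doubling it
     * (illustrative sketch):
     * </p>
     * <pre>{@code
     * StrTokenizer tokenizer = new StrTokenizer("\"a\"\"b\",c", ',', '"');
     * String[] fields = tokenizer.getTokenArray(); // ["a\"b", "c"]
     * }</pre>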
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * </p>
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StrTokenizer} will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
     * </p>
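     * <p>
     * A minimal subclass sketch (illustrative only; the "#" comment filter is a
     * made-up example of post-processing):
     * </p>
     * <pre>{@code
     * protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
     *     // overrides StrTokenizer#tokenize to filter the default result
     *     final List<String> tokens = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *     tokens.removeIf(token -> token != null && token.startsWith("#"));
     *     return tokens;
     * }
     * }</pre>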
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Returns a string representation of this tokenizer, including the parsed
     * tokens if tokenization has already occurred.
     *
     * @return The string representation
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}