001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025import java.util.StringTokenizer;
026
027import org.apache.commons.lang3.ArrayUtils;
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Tokenizes a string based on delimiters (separators)
032 * and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims
035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
036 * however it offers much more control and flexibility including implementing
037 * the {@link ListIterator} interface. By default, it is set up
038 * like {@link StringTokenizer}.
039 * </p>
040 * <p>
041 * The input String is split into a number of <em>tokens</em>.
042 * Each token is separated from the next String by a <em>delimiter</em>.
043 * One or more delimiter characters must be specified.
044 * </p>
045 * <p>
046 * Each token may be surrounded by quotes.
047 * The <em>quote</em> matcher specifies the quote character(s).
048 * A quote may be escaped within a quoted section by duplicating itself.
049 * </p>
050 * <p>
051 * Between each token and the delimiter are potentially characters that need trimming.
052 * The <em>trimmer</em> matcher specifies these characters.
053 * One usage might be to trim whitespace characters.
054 * </p>
055 * <p>
056 * At any point outside the quotes there might potentially be invalid characters.
057 * The <em>ignored</em> matcher specifies these characters to be removed.
058 * One usage might be to remove new line characters.
059 * </p>
060 * <p>
061 * Empty tokens may be removed or returned as null.
062 * </p>
063 * <pre>
064 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
065 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
067 * </pre>
068 *
069 * <table>
070 *  <caption>StrTokenizer properties and options</caption>
071 *  <tr>
072 *   <th>Property</th><th>Type</th><th>Default</th>
073 *  </tr>
074 *  <tr>
075 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
076 *  </tr>
077 *  <tr>
078 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
079 *  </tr>
080 *  <tr>
081 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
082 *  </tr>
083 *  <tr>
084 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
085 *  </tr>
086 *  <tr>
087 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
088 *  </tr>
089 * </table>
090 *
091 * @since 2.2
092 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
094 * StringTokenizer</a>.
095 */
096@Deprecated
097public class StrTokenizer implements ListIterator<String>, Cloneable {
098
099    // @formatter:off
100    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
101            .setDelimiterMatcher(StrMatcher.commaMatcher())
102            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
103            .setIgnoredMatcher(StrMatcher.noneMatcher())
104            .setTrimmerMatcher(StrMatcher.trimMatcher())
105            .setEmptyTokenAsNull(false)
106            .setIgnoreEmptyTokens(false);
107    // @formatter:on
108
109    // @formatter:off
110    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
111            .setDelimiterMatcher(StrMatcher.tabMatcher())
112            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
113            .setIgnoredMatcher(StrMatcher.noneMatcher())
114            .setTrimmerMatcher(StrMatcher.trimMatcher())
115            .setEmptyTokenAsNull(false)
116            .setIgnoreEmptyTokens(false);
117    // @formatter:on
118
119    /**
120     * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121     *
122     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123     */
124    private static StrTokenizer getCSVClone() {
125        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126    }
127    /**
128     * Gets a new tokenizer instance which parses Comma Separated Value strings
129     * initializing it with the given input.  The default for CSV processing
130     * will be trim whitespace from both ends (which can be overridden with
131     * the setTrimmer method).
132     * <p>
133     * You must call a "reset" method to set the string which you want to parse.
134     * </p>
135     * @return a new tokenizer instance which parses Comma Separated Value strings.
136     */
137    public static StrTokenizer getCSVInstance() {
138        return getCSVClone();
139    }
140    /**
141     * Gets a new tokenizer instance which parses Comma Separated Value strings
142     * initializing it with the given input.  The default for CSV processing
143     * will be trim whitespace from both ends (which can be overridden with
144     * the setTrimmer method).
145     *
146     * @param input  the text to parse.
147     * @return a new tokenizer instance which parses Comma Separated Value strings.
148     */
149    public static StrTokenizer getCSVInstance(final char[] input) {
150        final StrTokenizer tok = getCSVClone();
151        tok.reset(input);
152        return tok;
153    }
154
155    /**
156     * Gets a new tokenizer instance which parses Comma Separated Value strings
157     * initializing it with the given input.  The default for CSV processing
158     * will be trim whitespace from both ends (which can be overridden with
159     * the setTrimmer method).
160     *
161     * @param input  the text to parse.
162     * @return a new tokenizer instance which parses Comma Separated Value strings.
163     */
164    public static StrTokenizer getCSVInstance(final String input) {
165        final StrTokenizer tok = getCSVClone();
166        tok.reset(input);
167        return tok;
168    }
169    /**
170     * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171     *
172     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173     */
174    private static StrTokenizer getTSVClone() {
175        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176    }
177
178    /**
179     * Gets a new tokenizer instance which parses Tab Separated Value strings.
180     * The default for CSV processing will be trim whitespace from both ends
181     * (which can be overridden with the setTrimmer method).
182     * <p>
183     * You must call a "reset" method to set the string which you want to parse.
184     * </p>
185     * @return a new tokenizer instance which parses Tab Separated Value strings.
186     */
187    public static StrTokenizer getTSVInstance() {
188        return getTSVClone();
189    }
190
191    /**
192     * Gets a new tokenizer instance which parses Tab Separated Value strings.
193     * The default for CSV processing will be trim whitespace from both ends
194     * (which can be overridden with the setTrimmer method).
195     *
196     * @param input  the string to parse.
197     * @return a new tokenizer instance which parses Tab Separated Value strings.
198     */
199    public static StrTokenizer getTSVInstance(final char[] input) {
200        final StrTokenizer tok = getTSVClone();
201        tok.reset(input);
202        return tok;
203    }
204
205    /**
206     * Gets a new tokenizer instance which parses Tab Separated Value strings.
207     * The default for CSV processing will be trim whitespace from both ends
208     * (which can be overridden with the setTrimmer method).
209     *
210     * @param input  the string to parse.
211     * @return a new tokenizer instance which parses Tab Separated Value strings.
212     */
213    public static StrTokenizer getTSVInstance(final String input) {
214        final StrTokenizer tok = getTSVClone();
215        tok.reset(input);
216        return tok;
217    }
218    /** The text to work on. */
219    private char[] chars;
220
221    /** The parsed tokens */
222    private String[] tokens;
223
224    /** The current iteration position */
225    private int tokenPos;
226
227    /** The delimiter matcher */
228    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
229
230    /** The quote matcher */
231    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
232
233    /** The ignored matcher */
234    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
235
236    /** The trimmer matcher */
237    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
238
239    /** Whether to return empty tokens as null */
240    private boolean emptyAsNull;
241
242    /** Whether to ignore empty tokens */
243    private boolean ignoreEmptyTokens = true;
244
245    /**
246     * Constructs a tokenizer splitting on space, tab, newline and formfeed
247     * as per StringTokenizer, but with no text to tokenize.
248     * <p>
249     * This constructor is normally used with {@link #reset(String)}.
250     * </p>
251     */
252    public StrTokenizer() {
253        this.chars = null;
254    }
255
256    /**
257     * Constructs a tokenizer splitting on space, tab, newline and formfeed
258     * as per StringTokenizer.
259     *
260     * @param input  the string which is to be parsed, not cloned.
261     */
262    public StrTokenizer(final char[] input) {
263        this.chars = ArrayUtils.clone(input);
264    }
265
266    /**
267     * Constructs a tokenizer splitting on the specified character.
268     *
269     * @param input  the string which is to be parsed, not cloned.
270     * @param delim the field delimiter character.
271     */
272    public StrTokenizer(final char[] input, final char delim) {
273        this(input);
274        setDelimiterChar(delim);
275    }
276
277    /**
278     * Constructs a tokenizer splitting on the specified delimiter character
279     * and handling quotes using the specified quote character.
280     *
281     * @param input  the string which is to be parsed, not cloned.
282     * @param delim  the field delimiter character.
283     * @param quote  the field quoted string character.
284     */
285    public StrTokenizer(final char[] input, final char delim, final char quote) {
286        this(input, delim);
287        setQuoteChar(quote);
288    }
289
290    /**
291     * Constructs a tokenizer splitting on the specified string.
292     *
293     * @param input  the string which is to be parsed, not cloned.
294     * @param delim the field delimiter string.
295     */
296    public StrTokenizer(final char[] input, final String delim) {
297        this(input);
298        setDelimiterString(delim);
299    }
300
301    /**
302     * Constructs a tokenizer splitting using the specified delimiter matcher.
303     *
304     * @param input  the string which is to be parsed, not cloned.
305     * @param delim  the field delimiter matcher.
306     */
307    public StrTokenizer(final char[] input, final StrMatcher delim) {
308        this(input);
309        setDelimiterMatcher(delim);
310    }
311
312    /**
313     * Constructs a tokenizer splitting using the specified delimiter matcher
314     * and handling quotes using the specified quote matcher.
315     *
316     * @param input  the string which is to be parsed, not cloned.
317     * @param delim  the field delimiter character.
318     * @param quote  the field quoted string character.
319     */
320    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
321        this(input, delim);
322        setQuoteMatcher(quote);
323    }
324
325    /**
326     * Constructs a tokenizer splitting on space, tab, newline and formfeed
327     * as per StringTokenizer.
328     *
329     * @param input  the string which is to be parsed.
330     */
331    public StrTokenizer(final String input) {
332        if (input != null) {
333            chars = input.toCharArray();
334        } else {
335            chars = null;
336        }
337    }
338
339    /**
340     * Constructs a tokenizer splitting on the specified delimiter character.
341     *
342     * @param input  the string which is to be parsed.
343     * @param delim  the field delimiter character.
344     */
345    public StrTokenizer(final String input, final char delim) {
346        this(input);
347        setDelimiterChar(delim);
348    }
349
350    /**
351     * Constructs a tokenizer splitting on the specified delimiter character
352     * and handling quotes using the specified quote character.
353     *
354     * @param input  the string which is to be parsed.
355     * @param delim  the field delimiter character.
356     * @param quote  the field quoted string character.
357     */
358    public StrTokenizer(final String input, final char delim, final char quote) {
359        this(input, delim);
360        setQuoteChar(quote);
361    }
362
363    /**
364     * Constructs a tokenizer splitting on the specified delimiter string.
365     *
366     * @param input  the string which is to be parsed.
367     * @param delim  the field delimiter string.
368     */
369    public StrTokenizer(final String input, final String delim) {
370        this(input);
371        setDelimiterString(delim);
372    }
373
374    /**
375     * Constructs a tokenizer splitting using the specified delimiter matcher.
376     *
377     * @param input  the string which is to be parsed.
378     * @param delim  the field delimiter matcher.
379     */
380    public StrTokenizer(final String input, final StrMatcher delim) {
381        this(input);
382        setDelimiterMatcher(delim);
383    }
384
385    /**
386     * Constructs a tokenizer splitting using the specified delimiter matcher
387     * and handling quotes using the specified quote matcher.
388     *
389     * @param input  the string which is to be parsed.
390     * @param delim  the field delimiter matcher.
391     * @param quote  the field quoted string matcher.
392     */
393    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
394        this(input, delim);
395        setQuoteMatcher(quote);
396    }
397
398    /**
399     * Unsupported ListIterator operation.
400     *
401     * @param obj this parameter ignored.
402     * @throws UnsupportedOperationException always.
403     */
404    @Override
405    public void add(final String obj) {
406        throw new UnsupportedOperationException("add() is unsupported");
407    }
408
409    /**
410     * Adds a token to a list, paying attention to the parameters we've set.
411     *
412     * @param list  the list to add to.
413     * @param tok  the token to add.
414     */
415    private void addToken(final List<String> list, String tok) {
416        if (StringUtils.isEmpty(tok)) {
417            if (isIgnoreEmptyTokens()) {
418                return;
419            }
420            if (isEmptyTokenAsNull()) {
421                tok = null;
422            }
423        }
424        list.add(tok);
425    }
426
427    /**
428     * Checks if tokenization has been done, and if not then do it.
429     */
430    private void checkTokenized() {
431        if (tokens == null) {
432            if (chars == null) {
433                // still call tokenize as subclass may do some work
434                final List<String> split = tokenize(null, 0, 0);
435                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
436            } else {
437                final List<String> split = tokenize(chars, 0, chars.length);
438                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
439            }
440        }
441    }
442
443    /**
444     * Creates a new instance of this Tokenizer. The new instance is reset so
445     * that it will be at the start of the token list.
446     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
447     *
448     * @return a new instance of this Tokenizer which has been reset.
449     */
450    @Override
451    public Object clone() {
452        try {
453            return cloneReset();
454        } catch (final CloneNotSupportedException ex) {
455            return null;
456        }
457    }
458
459    /**
460     * Creates a new instance of this Tokenizer. The new instance is reset so that
461     * it will be at the start of the token list.
462     *
463     * @return a new instance of this Tokenizer which has been reset.
464     * @throws CloneNotSupportedException if there is a problem cloning.
465     */
466    Object cloneReset() throws CloneNotSupportedException {
467        // this method exists to enable 100% test coverage
468        final StrTokenizer cloned = (StrTokenizer) super.clone();
469        if (cloned.chars != null) {
470            cloned.chars = cloned.chars.clone();
471        }
472        cloned.reset();
473        return cloned;
474    }
475
476    /**
477     * Gets the String content that the tokenizer is parsing.
478     *
479     * @return the string content being parsed.
480     */
481    public String getContent() {
482        if (chars == null) {
483            return null;
484        }
485        return new String(chars);
486    }
487
488    /**
489     * Gets the field delimiter matcher.
490     *
491     * @return the delimiter matcher in use.
492     */
493    public StrMatcher getDelimiterMatcher() {
494        return this.delimMatcher;
495    }
496
497    // Ignored
498    /**
499     * Gets the ignored character matcher.
500     * <p>
501     * These characters are ignored when parsing the String, unless they are
502     * within a quoted region.
503     * The default value is not to ignore anything.
504     * </p>
505     *
506     * @return the ignored matcher in use.
507     */
508    public StrMatcher getIgnoredMatcher() {
509        return ignoredMatcher;
510    }
511
512    /**
513     * Gets the quote matcher currently in use.
514     * <p>
515     * The quote character is used to wrap data between the tokens.
516     * This enables delimiters to be entered as data.
517     * The default value is '"' (double quote).
518     * </p>
519     *
520     * @return the quote matcher in use.
521     */
522    public StrMatcher getQuoteMatcher() {
523        return quoteMatcher;
524    }
525
526    /**
527     * Gets a copy of the full token list as an independent modifiable array.
528     *
529     * @return the tokens as a String array.
530     */
531    public String[] getTokenArray() {
532        checkTokenized();
533        return tokens.clone();
534    }
535
536    /**
537     * Gets a copy of the full token list as an independent modifiable list.
538     *
539     * @return the tokens as a String array.
540     */
541    public List<String> getTokenList() {
542        checkTokenized();
543        final List<String> list = new ArrayList<>(tokens.length);
544        list.addAll(Arrays.asList(tokens));
545        return list;
546    }
547
548    /**
549     * Gets the trimmer character matcher.
550     * <p>
551     * These characters are trimmed off on each side of the delimiter
552     * until the token or quote is found.
553     * The default value is not to trim anything.
554     * </p>
555     *
556     * @return the trimmer matcher in use.
557     */
558    public StrMatcher getTrimmerMatcher() {
559        return trimmerMatcher;
560    }
561
562    /**
563     * Checks whether there are any more tokens.
564     *
565     * @return true if there are more tokens.
566     */
567    @Override
568    public boolean hasNext() {
569        checkTokenized();
570        return tokenPos < tokens.length;
571    }
572
573    /**
574     * Checks whether there are any previous tokens that can be iterated to.
575     *
576     * @return true if there are previous tokens.
577     */
578    @Override
579    public boolean hasPrevious() {
580        checkTokenized();
581        return tokenPos > 0;
582    }
583
584    /**
585     * Gets whether the tokenizer currently returns empty tokens as null.
586     * The default for this property is false.
587     *
588     * @return true if empty tokens are returned as null.
589     */
590    public boolean isEmptyTokenAsNull() {
591        return this.emptyAsNull;
592    }
593
594    /**
595     * Gets whether the tokenizer currently ignores empty tokens.
596     * The default for this property is true.
597     *
598     * @return true if empty tokens are not returned.
599     */
600    public boolean isIgnoreEmptyTokens() {
601        return ignoreEmptyTokens;
602    }
603
604    /**
605     * Checks if the characters at the index specified match the quote
606     * already matched in readNextToken().
607     *
608     * @param srcChars  the character array being tokenized.
609     * @param pos  the position to check for a quote.
610     * @param len  the length of the character array being tokenized.
611     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
612     * @param quoteLen  the length of the matched quote, 0 if no quoting.
613     * @return true if a quote is matched.
614     */
615    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
616        for (int i = 0; i < quoteLen; i++) {
617            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
618                return false;
619            }
620        }
621        return true;
622    }
623
624    /**
625     * Gets the next token.
626     *
627     * @return the next String token.
628     * @throws NoSuchElementException if there are no more elements.
629     */
630    @Override
631    public String next() {
632        if (hasNext()) {
633            return tokens[tokenPos++];
634        }
635        throw new NoSuchElementException();
636    }
637
638    /**
639     * Gets the index of the next token to return.
640     *
641     * @return the next token index.
642     */
643    @Override
644    public int nextIndex() {
645        return tokenPos;
646    }
647
648    /**
649     * Gets the next token from the String.
650     * Equivalent to {@link #next()} except it returns null rather than
651     * throwing {@link NoSuchElementException} when no tokens remain.
652     *
653     * @return the next sequential token, or null when no more tokens are found.
654     */
655    public String nextToken() {
656        if (hasNext()) {
657            return tokens[tokenPos++];
658        }
659        return null;
660    }
661
662    /**
663     * Gets the token previous to the last returned token.
664     *
665     * @return the previous token.
666     */
667    @Override
668    public String previous() {
669        if (hasPrevious()) {
670            return tokens[--tokenPos];
671        }
672        throw new NoSuchElementException();
673    }
674
675    /**
676     * Gets the index of the previous token.
677     *
678     * @return the previous token index.
679     */
680    @Override
681    public int previousIndex() {
682        return tokenPos - 1;
683    }
684
685    /**
686     * Gets the previous token from the String.
687     *
688     * @return the previous sequential token, or null when no more tokens are found.
689     */
690    public String previousToken() {
691        if (hasPrevious()) {
692            return tokens[--tokenPos];
693        }
694        return null;
695    }
696
697    /**
698     * Reads character by character through the String to get the next token.
699     *
700     * @param srcChars  the character array being tokenized.
701     * @param start  the first character of field.
702     * @param len  the length of the character array being tokenized.
703     * @param workArea  a temporary work area.
704     * @param tokenList  the list of parsed tokens.
705     * @return the starting position of the next field (the character
706     *  immediately after the delimiter), or -1 if end of string found.
707     */
708    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
709        // skip all leading whitespace, unless it is the
710        // field delimiter or the quote character
711        while (start < len) {
712            final int removeLen = Math.max(
713                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
714                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
715            if (removeLen == 0 ||
716                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
717                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
718                break;
719            }
720            start += removeLen;
721        }
722
723        // handle reaching end
724        if (start >= len) {
725            addToken(tokenList, StringUtils.EMPTY);
726            return -1;
727        }
728
729        // handle empty token
730        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
731        if (delimLen > 0) {
732            addToken(tokenList, StringUtils.EMPTY);
733            return start + delimLen;
734        }
735
736        // handle found token
737        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
738        if (quoteLen > 0) {
739            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
740        }
741        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
742    }
743
744    /**
745     * Reads a possibly quoted string token.
746     *
747     * @param srcChars  the character array being tokenized.
748     * @param start  the first character of field.
749     * @param len  the length of the character array being tokenized.
750     * @param workArea  a temporary work area.
751     * @param tokenList  the list of parsed tokens.
752     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
753     * @param quoteLen  the length of the matched quote, 0 if no quoting.
754     * @return the starting position of the next field (the character
755     *  immediately after the delimiter, or if end of string found,
756     *  then the length of string.
757     */
758    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
759                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
760        // Loop until we've found the end of the quoted
761        // string or the end of the input
762        workArea.clear();
763        int pos = start;
764        boolean quoting = quoteLen > 0;
765        int trimStart = 0;
766
767        while (pos < len) {
768            // quoting mode can occur several times throughout a string
769            // we must switch between quoting and non-quoting until we
770            // encounter a non-quoted delimiter, or end of string
771            if (quoting) {
772                // In quoting mode
773
774                // If we've found a quote character, see if it's
775                // followed by a second quote.  If so, then we need
776                // to actually put the quote character into the token
777                // rather than end the token.
778                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
779                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
780                        // matched pair of quotes, thus an escaped quote
781                        workArea.append(srcChars, pos, quoteLen);
782                        pos += quoteLen * 2;
783                        trimStart = workArea.size();
784                        continue;
785                    }
786
787                    // end of quoting
788                    quoting = false;
789                    pos += quoteLen;
790                    continue;
791                }
792
793            } else {
794                // Not in quoting mode
795
796                // check for delimiter, and thus end of token
797                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
798                if (delimLen > 0) {
799                    // return condition when end of token found
800                    addToken(tokenList, workArea.substring(0, trimStart));
801                    return pos + delimLen;
802                }
803
804                // check for quote, and thus back into quoting mode
805                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
806                    quoting = true;
807                    pos += quoteLen;
808                    continue;
809                }
810
811                // check for ignored (outside quotes), and ignore
812                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
813                if (ignoredLen > 0) {
814                    pos += ignoredLen;
815                    continue;
816                }
817
818                // check for trimmed character
819                // don't yet know if it's at the end, so copy to workArea
820                // use trimStart to keep track of trim at the end
821                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
822                if (trimmedLen > 0) {
823                    workArea.append(srcChars, pos, trimmedLen);
824                    pos += trimmedLen;
825                    continue;
826                }
827            }
828            // copy regular character from inside quotes
829            workArea.append(srcChars[pos++]);
830            trimStart = workArea.size();
831        }
832
833        // return condition when end of string found
834        addToken(tokenList, workArea.substring(0, trimStart));
835        return -1;
836    }
837
838    /**
839     * Unsupported ListIterator operation.
840     *
841     * @throws UnsupportedOperationException always.
842     */
843    @Override
844    public void remove() {
845        throw new UnsupportedOperationException("remove() is unsupported");
846    }
847
848    /**
849     * Resets this tokenizer, forgetting all parsing and iteration already completed.
850     * <p>
851     * This method allows the same tokenizer to be reused for the same String.
852     * </p>
853     *
854     * @return {@code this} instance.
855     */
856    public StrTokenizer reset() {
857        tokenPos = 0;
858        tokens = null;
859        return this;
860    }
861
862    /**
863     * Reset this tokenizer, giving it a new input string to parse.
864     * In this manner you can re-use a tokenizer with the same settings
865     * on multiple input lines.
866     *
867     * @param input  the new character array to tokenize, not cloned, null sets no text to parse.
868     * @return {@code this} instance.
869     */
870    public StrTokenizer reset(final char[] input) {
871        reset();
872        this.chars = ArrayUtils.clone(input);
873        return this;
874    }
875
876    /**
877     * Reset this tokenizer, giving it a new input string to parse.
878     * In this manner you can re-use a tokenizer with the same settings
879     * on multiple input lines.
880     *
881     * @param input  the new string to tokenize, null sets no text to parse.
882     * @return {@code this} instance.
883     */
884    public StrTokenizer reset(final String input) {
885        reset();
886        if (input != null) {
887            this.chars = input.toCharArray();
888        } else {
889            this.chars = null;
890        }
891        return this;
892    }
893
894    /**
895     * Unsupported ListIterator operation.
896     *
897     * @param obj this parameter ignored.
898     * @throws UnsupportedOperationException always.
899     */
900    @Override
901    public void set(final String obj) {
902        throw new UnsupportedOperationException("set() is unsupported");
903    }
904
905    /**
906     * Sets the field delimiter character.
907     *
908     * @param delim  the delimiter character to use.
909     * @return this, to enable chaining.
910     */
911    public StrTokenizer setDelimiterChar(final char delim) {
912        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
913    }
914
915    /**
916     * Sets the field delimiter matcher.
917     * <p>
918     * The delimiter is used to separate one token from another.
919     * </p>
920     *
921     * @param delim  the delimiter matcher to use.
922     * @return this, to enable chaining.
923     */
924    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
925        if (delim == null) {
926            this.delimMatcher = StrMatcher.noneMatcher();
927        } else {
928            this.delimMatcher = delim;
929        }
930        return this;
931    }
932
933    /**
934     * Sets the field delimiter string.
935     *
936     * @param delim  the delimiter string to use.
937     * @return this, to enable chaining.
938     */
939    public StrTokenizer setDelimiterString(final String delim) {
940        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
941    }
942
943    /**
944     * Sets whether the tokenizer should return empty tokens as null.
945     * The default for this property is false.
946     *
947     * @param emptyAsNull  whether empty tokens are returned as null.
948     * @return this, to enable chaining.
949     */
950    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
951        this.emptyAsNull = emptyAsNull;
952        return this;
953    }
954
955    /**
956     * Sets the character to ignore.
957     * <p>
958     * This character is ignored when parsing the String, unless it is
959     * within a quoted region.
960     *
961     * @param ignored  the ignored character to use.
962     * @return this, to enable chaining.
963     */
964    public StrTokenizer setIgnoredChar(final char ignored) {
965        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
966    }
967
968    /**
969     * Sets the matcher for characters to ignore.
970     * <p>
971     * These characters are ignored when parsing the String, unless they are
972     * within a quoted region.
973     * </p>
974     *
975     * @param ignored  the ignored matcher to use, null ignored.
976     * @return {@code this} instance.
977     */
978    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
979        if (ignored != null) {
980            this.ignoredMatcher = ignored;
981        }
982        return this;
983    }
984
985    /**
986     * Sets whether the tokenizer should ignore and not return empty tokens.
987     * The default for this property is true.
988     *
989     * @param ignoreEmptyTokens  whether empty tokens are not returned.
990     * @return {@code this} instance.
991     */
992    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
993        this.ignoreEmptyTokens = ignoreEmptyTokens;
994        return this;
995    }
996
997    /**
998     * Sets the quote character to use.
999     * <p>
1000     * The quote character is used to wrap data between the tokens.
1001     * This enables delimiters to be entered as data.
1002     * </p>
1003     *
1004     * @param quote  the quote character to use.
1005     * @return {@code this} instance.
1006     */
1007    public StrTokenizer setQuoteChar(final char quote) {
1008        return setQuoteMatcher(StrMatcher.charMatcher(quote));
1009    }
1010
1011    /**
1012     * Sets the quote matcher to use.
1013     * <p>
1014     * The quote character is used to wrap data between the tokens.
1015     * This enables delimiters to be entered as data.
1016     * </p>
1017     *
1018     * @param quote  the quote matcher to use, null ignored.
1019     * @return {@code this} instance.
1020     */
1021    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1022        if (quote != null) {
1023            this.quoteMatcher = quote;
1024        }
1025        return this;
1026    }
1027
1028    /**
1029     * Sets the matcher for characters to trim.
1030     * <p>
1031     * These characters are trimmed off on each side of the delimiter
1032     * until the token or quote is found.
1033     * </p>
1034     *
1035     * @param trimmer  the trimmer matcher to use, null ignored.
1036     * @return {@code this} instance.
1037     */
1038    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1039        if (trimmer != null) {
1040            this.trimmerMatcher = trimmer;
1041        }
1042        return this;
1043    }
1044
1045    // API
1046    /**
1047     * Gets the number of tokens found in the String.
1048     *
1049     * @return the number of matched tokens.
1050     */
1051    public int size() {
1052        checkTokenized();
1053        return tokens.length;
1054    }
1055
1056    /**
1057     * Internal method to performs the tokenization.
1058     * <p>
1059     * Most users of this class do not need to call this method. This method
1060     * will be called automatically by other (public) methods when required.
1061     * </p>
1062     * <p>
1063     * This method exists to allow subclasses to add code before or after the
1064     * tokenization. For example, a subclass could alter the character array,
1065     * offset or count to be parsed, or call the tokenizer multiple times on
1066     * multiple strings. It is also be possible to filter the results.
1067     * </p>
1068     * <p>
1069     * {@link StrTokenizer} will always pass a zero offset and a count
1070     * equal to the length of the array to this method, however a subclass
1071     * may pass other values, or even an entirely different array.
1072     * </p>
1073     *
1074     * @param srcChars  the character array being tokenized, may be null.
1075     * @param offset  the start position within the character array, must be valid.
1076     * @param count  the number of characters to tokenize, must be valid.
1077     * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1078     */
1079    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1080        if (ArrayUtils.isEmpty(srcChars)) {
1081            return Collections.emptyList();
1082        }
1083        final StrBuilder buf = new StrBuilder();
1084        final List<String> tokenList = new ArrayList<>();
1085        int pos = offset;
1086
1087        // loop around the entire buffer
1088        while (pos >= 0 && pos < count) {
1089            // find next token
1090            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1091
1092            // handle case where end of string is a delimiter
1093            if (pos >= count) {
1094                addToken(tokenList, StringUtils.EMPTY);
1095            }
1096        }
1097        return tokenList;
1098    }
1099
1100    /**
1101     * Gets the String content that the tokenizer is parsing.
1102     *
1103     * @return the string content being parsed.
1104     */
1105    @Override
1106    public String toString() {
1107        if (tokens == null) {
1108            return "StrTokenizer[not tokenized yet]";
1109        }
1110        return "StrTokenizer" + getTokenList();
1111    }
1112
1113}