/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and the concept of ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"     (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"     (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a", ", b ,", "c" (quoted text untouched)
 * </pre>
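 * <p>
 * For example, a tokenizer with a custom delimiter and quote character might be
 * used as follows (an illustrative sketch; the commented result assumes the
 * behaviour described above):
 * <pre>{@code
 * StrTokenizer tok = new StrTokenizer("one;two;'three;3'", ';', '\'');
 * List<String> tokens = tok.getTokenList(); // [one, two, three;3]
 * }</pre>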
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
 * StringTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmerMatcher method).
     *
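     * <p>
     * For example (an illustrative sketch; the commented result assumes the CSV
     * defaults described above):
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance("a, \"b,c\", d");
     * String[] fields = tok.getTokenArray(); // ["a", "b,c", "d"]
     * }</pre>
     *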
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmerMatcher method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }


    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null if there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
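     * <p>
     * For example, the same tokenizer might be reused line by line (an
     * illustrative sketch; {@code lines} and {@code process} are hypothetical):
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     process(tok.getTokenList()); // hypothetical per-line callback
     * }
     * }</pre>
     *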
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done and, if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results, as shown in
     * the sketch below.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
     *
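     * <p>
     * For example, a subclass might filter the parsed tokens (an illustrative
     * sketch only):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a #b c") {
     *     protected List<String> tokenize(char[] chars, int offset, int count) {
     *         List<String> filtered = new ArrayList<String>();
     *         for (String token : super.tokenize(chars, offset, count)) {
     *             if (token == null || !token.startsWith("#")) {
     *                 filtered.add(token);
     *             }
     *         }
     *         return filtered;
     *     }
     * };
     * // tok.getTokenList() would yield ["a", "c"]
     * }</pre>
     *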
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  was reached
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * By default no quoting is used; the CSV and TSV instances use the double quote character.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data (see the example below).
     *
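     * <p>
     * For example (an illustrative sketch):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a;'b;c'", ';').setQuoteChar('\'');
     * // tok.getTokenArray() would yield ["a", "b;c"]
     * }</pre>
     *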
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
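     * <p>
     * For example (an illustrative sketch):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a : b : c", ':')
     *         .setTrimmerMatcher(StrMatcher.trimMatcher());
     * // tok.getTokenArray() would yield ["a", "b", "c"]
     * }</pre>
     *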
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
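     * <p>
     * For example (an illustrative sketch combining this with
     * {@link #setEmptyTokenAsNull(boolean)}):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a,,b", ',')
     *         .setIgnoreEmptyTokens(false)
     *         .setEmptyTokenAsNull(true);
     * // tok.getTokenList() would yield ["a", null, "b"]
     * }</pre>
     *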
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of the tokenizer, including the parsed
     * token list if tokenization has already been performed.
     *
     * @return a string representation of the tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}