001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025
026import org.apache.commons.lang3.ArrayUtils;
027import org.apache.commons.lang3.StringUtils;
028
029/**
030 * Tokenizes a string based on delimiters (separators)
031 * and supporting quoting and ignored character concepts.
032 * <p>
033 * This class can split a String into many smaller strings. It aims
034 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
035 * however it offers much more control and flexibility including implementing
036 * the {@code ListIterator} interface. By default, it is set up
037 * like {@code StringTokenizer}.
038 * <p>
039 * The input String is split into a number of <i>tokens</i>.
040 * Each token is separated from the next String by a <i>delimiter</i>.
041 * One or more delimiter characters must be specified.
042 * <p>
043 * Each token may be surrounded by quotes.
044 * The <i>quote</i> matcher specifies the quote character(s).
045 * A quote may be escaped within a quoted section by duplicating itself.
046 * <p>
047 * Between each token and the delimiter are potentially characters that need trimming.
048 * The <i>trimmer</i> matcher specifies these characters.
049 * One usage might be to trim whitespace characters.
050 * <p>
051 * At any point outside the quotes there might potentially be invalid characters.
052 * The <i>ignored</i> matcher specifies these characters to be removed.
053 * One usage might be to remove new line characters.
054 * <p>
055 * Empty tokens may be removed or returned as null.
056 * <pre>
057 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
058 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
059 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
060 * </pre>
061 *
062 * <table>
063 *  <caption>StrTokenizer properties and options</caption>
064 *  <tr>
065 *   <th>Property</th><th>Type</th><th>Default</th>
066 *  </tr>
067 *  <tr>
068 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
069 *  </tr>
070 *  <tr>
071 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
072 *  </tr>
073 *  <tr>
074 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
075 *  </tr>
076 *  <tr>
077 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
078 *  </tr>
079 *  <tr>
080 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
081 *  </tr>
082 * </table>
083 *
084 * @since 2.2
085 * @deprecated as of 3.6, use commons-text
086 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
087 * StringTokenizer</a> instead
088 */
089@Deprecated
090public class StrTokenizer implements ListIterator<String>, Cloneable {
091
    /** Prototype CSV tokenizer; cloned (never handed out directly) by {@link #getCSVInstance()}. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Prototype TSV tokenizer; cloned (never handed out directly) by {@link #getTSVInstance()}. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        // CSV prototype: comma-delimited, double-quoted, whitespace-trimmed.
        // Empty tokens are kept and returned as empty strings, not null.
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        // TSV prototype: identical to the CSV prototype except for the tab delimiter.
        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on; null means there is no text to parse. */
    private char[] chars;
    /** The parsed tokens; null until tokenization has been performed (lazy, see checkTokenized()). */
    private String[] tokens;
    /** The current iteration position within {@link #tokens}. */
    private int tokenPos;

    /** The delimiter matcher; defaults to whitespace, as per StringTokenizer. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher; defaults to matching nothing (quoting disabled). */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored-character matcher; defaults to matching nothing. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher; defaults to matching nothing (no trimming). */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;
    /** Whether to ignore (drop) empty tokens entirely. */
    private boolean ignoreEmptyTokens = true;
132
133    //-----------------------------------------------------------------------
134
135    /**
136     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
137     *
138     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
139     */
140    private static StrTokenizer getCSVClone() {
141        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
142    }
143
144    /**
145     * Gets a new tokenizer instance which parses Comma Separated Value strings
146     * initializing it with the given input.  The default for CSV processing
147     * will be trim whitespace from both ends (which can be overridden with
148     * the setTrimmer method).
149     * <p>
150     * You must call a "reset" method to set the string which you want to parse.
151     * @return a new tokenizer instance which parses Comma Separated Value strings
152     */
153    public static StrTokenizer getCSVInstance() {
154        return getCSVClone();
155    }
156
157    /**
158     * Gets a new tokenizer instance which parses Comma Separated Value strings
159     * initializing it with the given input.  The default for CSV processing
160     * will be trim whitespace from both ends (which can be overridden with
161     * the setTrimmer method).
162     *
163     * @param input  the text to parse
164     * @return a new tokenizer instance which parses Comma Separated Value strings
165     */
166    public static StrTokenizer getCSVInstance(final String input) {
167        final StrTokenizer tok = getCSVClone();
168        tok.reset(input);
169        return tok;
170    }
171
172    /**
173     * Gets a new tokenizer instance which parses Comma Separated Value strings
174     * initializing it with the given input.  The default for CSV processing
175     * will be trim whitespace from both ends (which can be overridden with
176     * the setTrimmer method).
177     *
178     * @param input  the text to parse
179     * @return a new tokenizer instance which parses Comma Separated Value strings
180     */
181    public static StrTokenizer getCSVInstance(final char[] input) {
182        final StrTokenizer tok = getCSVClone();
183        tok.reset(input);
184        return tok;
185    }
186
187    /**
188     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
189     *
190     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
191     */
192    private static StrTokenizer getTSVClone() {
193        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
194    }
195
196
197    /**
198     * Gets a new tokenizer instance which parses Tab Separated Value strings.
199     * The default for CSV processing will be trim whitespace from both ends
200     * (which can be overridden with the setTrimmer method).
201     * <p>
202     * You must call a "reset" method to set the string which you want to parse.
203     * @return a new tokenizer instance which parses Tab Separated Value strings.
204     */
205    public static StrTokenizer getTSVInstance() {
206        return getTSVClone();
207    }
208
209    /**
210     * Gets a new tokenizer instance which parses Tab Separated Value strings.
211     * The default for CSV processing will be trim whitespace from both ends
212     * (which can be overridden with the setTrimmer method).
213     * @param input  the string to parse
214     * @return a new tokenizer instance which parses Tab Separated Value strings.
215     */
216    public static StrTokenizer getTSVInstance(final String input) {
217        final StrTokenizer tok = getTSVClone();
218        tok.reset(input);
219        return tok;
220    }
221
222    /**
223     * Gets a new tokenizer instance which parses Tab Separated Value strings.
224     * The default for CSV processing will be trim whitespace from both ends
225     * (which can be overridden with the setTrimmer method).
226     * @param input  the string to parse
227     * @return a new tokenizer instance which parses Tab Separated Value strings.
228     */
229    public static StrTokenizer getTSVInstance(final char[] input) {
230        final StrTokenizer tok = getTSVClone();
231        tok.reset(input);
232        return tok;
233    }
234
235    //-----------------------------------------------------------------------
236    /**
237     * Constructs a tokenizer splitting on space, tab, newline and formfeed
238     * as per StringTokenizer, but with no text to tokenize.
239     * <p>
240     * This constructor is normally used with {@link #reset(String)}.
241     */
242    public StrTokenizer() {
243        this.chars = null;
244    }
245
246    /**
247     * Constructs a tokenizer splitting on space, tab, newline and formfeed
248     * as per StringTokenizer.
249     *
250     * @param input  the string which is to be parsed
251     */
252    public StrTokenizer(final String input) {
253        if (input != null) {
254            chars = input.toCharArray();
255        } else {
256            chars = null;
257        }
258    }
259
260    /**
261     * Constructs a tokenizer splitting on the specified delimiter character.
262     *
263     * @param input  the string which is to be parsed
264     * @param delim  the field delimiter character
265     */
266    public StrTokenizer(final String input, final char delim) {
267        this(input);
268        setDelimiterChar(delim);
269    }
270
271    /**
272     * Constructs a tokenizer splitting on the specified delimiter string.
273     *
274     * @param input  the string which is to be parsed
275     * @param delim  the field delimiter string
276     */
277    public StrTokenizer(final String input, final String delim) {
278        this(input);
279        setDelimiterString(delim);
280    }
281
282    /**
283     * Constructs a tokenizer splitting using the specified delimiter matcher.
284     *
285     * @param input  the string which is to be parsed
286     * @param delim  the field delimiter matcher
287     */
288    public StrTokenizer(final String input, final StrMatcher delim) {
289        this(input);
290        setDelimiterMatcher(delim);
291    }
292
293    /**
294     * Constructs a tokenizer splitting on the specified delimiter character
295     * and handling quotes using the specified quote character.
296     *
297     * @param input  the string which is to be parsed
298     * @param delim  the field delimiter character
299     * @param quote  the field quoted string character
300     */
301    public StrTokenizer(final String input, final char delim, final char quote) {
302        this(input, delim);
303        setQuoteChar(quote);
304    }
305
306    /**
307     * Constructs a tokenizer splitting using the specified delimiter matcher
308     * and handling quotes using the specified quote matcher.
309     *
310     * @param input  the string which is to be parsed
311     * @param delim  the field delimiter matcher
312     * @param quote  the field quoted string matcher
313     */
314    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
315        this(input, delim);
316        setQuoteMatcher(quote);
317    }
318
319    /**
320     * Constructs a tokenizer splitting on space, tab, newline and formfeed
321     * as per StringTokenizer.
322     *
323     * @param input  the string which is to be parsed, not cloned
324     */
325    public StrTokenizer(final char[] input) {
326        this.chars = ArrayUtils.clone(input);
327    }
328
329    /**
330     * Constructs a tokenizer splitting on the specified character.
331     *
332     * @param input  the string which is to be parsed, not cloned
333     * @param delim the field delimiter character
334     */
335    public StrTokenizer(final char[] input, final char delim) {
336        this(input);
337        setDelimiterChar(delim);
338    }
339
340    /**
341     * Constructs a tokenizer splitting on the specified string.
342     *
343     * @param input  the string which is to be parsed, not cloned
344     * @param delim the field delimiter string
345     */
346    public StrTokenizer(final char[] input, final String delim) {
347        this(input);
348        setDelimiterString(delim);
349    }
350
351    /**
352     * Constructs a tokenizer splitting using the specified delimiter matcher.
353     *
354     * @param input  the string which is to be parsed, not cloned
355     * @param delim  the field delimiter matcher
356     */
357    public StrTokenizer(final char[] input, final StrMatcher delim) {
358        this(input);
359        setDelimiterMatcher(delim);
360    }
361
362    /**
363     * Constructs a tokenizer splitting on the specified delimiter character
364     * and handling quotes using the specified quote character.
365     *
366     * @param input  the string which is to be parsed, not cloned
367     * @param delim  the field delimiter character
368     * @param quote  the field quoted string character
369     */
370    public StrTokenizer(final char[] input, final char delim, final char quote) {
371        this(input, delim);
372        setQuoteChar(quote);
373    }
374
375    /**
376     * Constructs a tokenizer splitting using the specified delimiter matcher
377     * and handling quotes using the specified quote matcher.
378     *
379     * @param input  the string which is to be parsed, not cloned
380     * @param delim  the field delimiter character
381     * @param quote  the field quoted string character
382     */
383    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
384        this(input, delim);
385        setQuoteMatcher(quote);
386    }
387
388    // API
389    //-----------------------------------------------------------------------
390    /**
391     * Gets the number of tokens found in the String.
392     *
393     * @return the number of matched tokens
394     */
395    public int size() {
396        checkTokenized();
397        return tokens.length;
398    }
399
400    /**
401     * Gets the next token from the String.
402     * Equivalent to {@link #next()} except it returns null rather than
403     * throwing {@link NoSuchElementException} when no tokens remain.
404     *
405     * @return the next sequential token, or null when no more tokens are found
406     */
407    public String nextToken() {
408        if (hasNext()) {
409            return tokens[tokenPos++];
410        }
411        return null;
412    }
413
414    /**
415     * Gets the previous token from the String.
416     *
417     * @return the previous sequential token, or null when no more tokens are found
418     */
419    public String previousToken() {
420        if (hasPrevious()) {
421            return tokens[--tokenPos];
422        }
423        return null;
424    }
425
426    /**
427     * Gets a copy of the full token list as an independent modifiable array.
428     *
429     * @return the tokens as a String array
430     */
431    public String[] getTokenArray() {
432        checkTokenized();
433        return tokens.clone();
434    }
435
436    /**
437     * Gets a copy of the full token list as an independent modifiable list.
438     *
439     * @return the tokens as a String array
440     */
441    public List<String> getTokenList() {
442        checkTokenized();
443        final List<String> list = new ArrayList<>(tokens.length);
444        list.addAll(Arrays.asList(tokens));
445        return list;
446    }
447
448    /**
449     * Resets this tokenizer, forgetting all parsing and iteration already completed.
450     * <p>
451     * This method allows the same tokenizer to be reused for the same String.
452     *
453     * @return this, to enable chaining
454     */
455    public StrTokenizer reset() {
456        tokenPos = 0;
457        tokens = null;
458        return this;
459    }
460
461    /**
462     * Reset this tokenizer, giving it a new input string to parse.
463     * In this manner you can re-use a tokenizer with the same settings
464     * on multiple input lines.
465     *
466     * @param input  the new string to tokenize, null sets no text to parse
467     * @return this, to enable chaining
468     */
469    public StrTokenizer reset(final String input) {
470        reset();
471        if (input != null) {
472            this.chars = input.toCharArray();
473        } else {
474            this.chars = null;
475        }
476        return this;
477    }
478
479    /**
480     * Reset this tokenizer, giving it a new input string to parse.
481     * In this manner you can re-use a tokenizer with the same settings
482     * on multiple input lines.
483     *
484     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
485     * @return this, to enable chaining
486     */
487    public StrTokenizer reset(final char[] input) {
488        reset();
489        this.chars = ArrayUtils.clone(input);
490        return this;
491    }
492
493    // ListIterator
494    //-----------------------------------------------------------------------
495    /**
496     * Checks whether there are any more tokens.
497     *
498     * @return true if there are more tokens
499     */
500    @Override
501    public boolean hasNext() {
502        checkTokenized();
503        return tokenPos < tokens.length;
504    }
505
506    /**
507     * Gets the next token.
508     *
509     * @return the next String token
510     * @throws NoSuchElementException if there are no more elements
511     */
512    @Override
513    public String next() {
514        if (hasNext()) {
515            return tokens[tokenPos++];
516        }
517        throw new NoSuchElementException();
518    }
519
520    /**
521     * Gets the index of the next token to return.
522     *
523     * @return the next token index
524     */
525    @Override
526    public int nextIndex() {
527        return tokenPos;
528    }
529
530    /**
531     * Checks whether there are any previous tokens that can be iterated to.
532     *
533     * @return true if there are previous tokens
534     */
535    @Override
536    public boolean hasPrevious() {
537        checkTokenized();
538        return tokenPos > 0;
539    }
540
541    /**
542     * Gets the token previous to the last returned token.
543     *
544     * @return the previous token
545     */
546    @Override
547    public String previous() {
548        if (hasPrevious()) {
549            return tokens[--tokenPos];
550        }
551        throw new NoSuchElementException();
552    }
553
554    /**
555     * Gets the index of the previous token.
556     *
557     * @return the previous token index
558     */
559    @Override
560    public int previousIndex() {
561        return tokenPos - 1;
562    }
563
564    /**
565     * Unsupported ListIterator operation.
566     *
567     * @throws UnsupportedOperationException always
568     */
569    @Override
570    public void remove() {
571        throw new UnsupportedOperationException("remove() is unsupported");
572    }
573
574    /**
575     * Unsupported ListIterator operation.
576     * @param obj this parameter ignored.
577     * @throws UnsupportedOperationException always
578     */
579    @Override
580    public void set(final String obj) {
581        throw new UnsupportedOperationException("set() is unsupported");
582    }
583
584    /**
585     * Unsupported ListIterator operation.
586     * @param obj this parameter ignored.
587     * @throws UnsupportedOperationException always
588     */
589    @Override
590    public void add(final String obj) {
591        throw new UnsupportedOperationException("add() is unsupported");
592    }
593
594    // Implementation
595    //-----------------------------------------------------------------------
596    /**
597     * Checks if tokenization has been done, and if not then do it.
598     */
599    private void checkTokenized() {
600        if (tokens == null) {
601            if (chars == null) {
602                // still call tokenize as subclass may do some work
603                final List<String> split = tokenize(null, 0, 0);
604                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
605            } else {
606                final List<String> split = tokenize(chars, 0, chars.length);
607                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
608            }
609        }
610    }
611
612    /**
613     * Internal method to performs the tokenization.
614     * <p>
615     * Most users of this class do not need to call this method. This method
616     * will be called automatically by other (public) methods when required.
617     * <p>
618     * This method exists to allow subclasses to add code before or after the
619     * tokenization. For example, a subclass could alter the character array,
620     * offset or count to be parsed, or call the tokenizer multiple times on
621     * multiple strings. It is also be possible to filter the results.
622     * <p>
623     * {@code StrTokenizer} will always pass a zero offset and a count
624     * equal to the length of the array to this method, however a subclass
625     * may pass other values, or even an entirely different array.
626     *
627     * @param srcChars  the character array being tokenized, may be null
628     * @param offset  the start position within the character array, must be valid
629     * @param count  the number of characters to tokenize, must be valid
630     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
631     */
632    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
633        if (srcChars == null || count == 0) {
634            return Collections.emptyList();
635        }
636        final StrBuilder buf = new StrBuilder();
637        final List<String> tokenList = new ArrayList<>();
638        int pos = offset;
639
640        // loop around the entire buffer
641        while (pos >= 0 && pos < count) {
642            // find next token
643            pos = readNextToken(srcChars, pos, count, buf, tokenList);
644
645            // handle case where end of string is a delimiter
646            if (pos >= count) {
647                addToken(tokenList, StringUtils.EMPTY);
648            }
649        }
650        return tokenList;
651    }
652
653    /**
654     * Adds a token to a list, paying attention to the parameters we've set.
655     *
656     * @param list  the list to add to
657     * @param tok  the token to add
658     */
659    private void addToken(final List<String> list, String tok) {
660        if (StringUtils.isEmpty(tok)) {
661            if (isIgnoreEmptyTokens()) {
662                return;
663            }
664            if (isEmptyTokenAsNull()) {
665                tok = null;
666            }
667        }
668        list.add(tok);
669    }
670
671    /**
672     * Reads character by character through the String to get the next token.
673     *
674     * @param srcChars  the character array being tokenized
675     * @param start  the first character of field
676     * @param len  the length of the character array being tokenized
677     * @param workArea  a temporary work area
678     * @param tokenList  the list of parsed tokens
679     * @return the starting position of the next field (the character
680     *  immediately after the delimiter), or -1 if end of string found
681     */
682    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
683        // skip all leading whitespace, unless it is the
684        // field delimiter or the quote character
685        while (start < len) {
686            final int removeLen = Math.max(
687                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
688                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
689            if (removeLen == 0 ||
690                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
691                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
692                break;
693            }
694            start += removeLen;
695        }
696
697        // handle reaching end
698        if (start >= len) {
699            addToken(tokenList, StringUtils.EMPTY);
700            return -1;
701        }
702
703        // handle empty token
704        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
705        if (delimLen > 0) {
706            addToken(tokenList, StringUtils.EMPTY);
707            return start + delimLen;
708        }
709
710        // handle found token
711        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
712        if (quoteLen > 0) {
713            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
714        }
715        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
716    }
717
718    /**
719     * Reads a possibly quoted string token.
720     *
721     * @param srcChars  the character array being tokenized
722     * @param start  the first character of field
723     * @param len  the length of the character array being tokenized
724     * @param workArea  a temporary work area
725     * @param tokenList  the list of parsed tokens
726     * @param quoteStart  the start position of the matched quote, 0 if no quoting
727     * @param quoteLen  the length of the matched quote, 0 if no quoting
728     * @return the starting position of the next field (the character
729     *  immediately after the delimiter, or if end of string found,
730     *  then the length of string
731     */
732    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
733                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
734        // Loop until we've found the end of the quoted
735        // string or the end of the input
736        workArea.clear();
737        int pos = start;
738        boolean quoting = quoteLen > 0;
739        int trimStart = 0;
740
741        while (pos < len) {
742            // quoting mode can occur several times throughout a string
743            // we must switch between quoting and non-quoting until we
744            // encounter a non-quoted delimiter, or end of string
745            if (quoting) {
746                // In quoting mode
747
748                // If we've found a quote character, see if it's
749                // followed by a second quote.  If so, then we need
750                // to actually put the quote character into the token
751                // rather than end the token.
752                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
753                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
754                        // matched pair of quotes, thus an escaped quote
755                        workArea.append(srcChars, pos, quoteLen);
756                        pos += quoteLen * 2;
757                        trimStart = workArea.size();
758                        continue;
759                    }
760
761                    // end of quoting
762                    quoting = false;
763                    pos += quoteLen;
764                    continue;
765                }
766
767                // copy regular character from inside quotes
768                workArea.append(srcChars[pos++]);
769                trimStart = workArea.size();
770
771            } else {
772                // Not in quoting mode
773
774                // check for delimiter, and thus end of token
775                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
776                if (delimLen > 0) {
777                    // return condition when end of token found
778                    addToken(tokenList, workArea.substring(0, trimStart));
779                    return pos + delimLen;
780                }
781
782                // check for quote, and thus back into quoting mode
783                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
784                    quoting = true;
785                    pos += quoteLen;
786                    continue;
787                }
788
789                // check for ignored (outside quotes), and ignore
790                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
791                if (ignoredLen > 0) {
792                    pos += ignoredLen;
793                    continue;
794                }
795
796                // check for trimmed character
797                // don't yet know if its at the end, so copy to workArea
798                // use trimStart to keep track of trim at the end
799                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
800                if (trimmedLen > 0) {
801                    workArea.append(srcChars, pos, trimmedLen);
802                    pos += trimmedLen;
803                    continue;
804                }
805
806                // copy regular character from outside quotes
807                workArea.append(srcChars[pos++]);
808                trimStart = workArea.size();
809            }
810        }
811
812        // return condition when end of string found
813        addToken(tokenList, workArea.substring(0, trimStart));
814        return -1;
815    }
816
817    /**
818     * Checks if the characters at the index specified match the quote
819     * already matched in readNextToken().
820     *
821     * @param srcChars  the character array being tokenized
822     * @param pos  the position to check for a quote
823     * @param len  the length of the character array being tokenized
824     * @param quoteStart  the start position of the matched quote, 0 if no quoting
825     * @param quoteLen  the length of the matched quote, 0 if no quoting
826     * @return true if a quote is matched
827     */
828    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
829        for (int i = 0; i < quoteLen; i++) {
830            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
831                return false;
832            }
833        }
834        return true;
835    }
836
837    // Delimiter
838    //-----------------------------------------------------------------------
839    /**
840     * Gets the field delimiter matcher.
841     *
842     * @return the delimiter matcher in use
843     */
844    public StrMatcher getDelimiterMatcher() {
845        return this.delimMatcher;
846    }
847
848    /**
849     * Sets the field delimiter matcher.
850     * <p>
851     * The delimiter is used to separate one token from another.
852     *
853     * @param delim  the delimiter matcher to use
854     * @return this, to enable chaining
855     */
856    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
857        if (delim == null) {
858            this.delimMatcher = StrMatcher.noneMatcher();
859        } else {
860            this.delimMatcher = delim;
861        }
862        return this;
863    }
864
865    /**
866     * Sets the field delimiter character.
867     *
868     * @param delim  the delimiter character to use
869     * @return this, to enable chaining
870     */
871    public StrTokenizer setDelimiterChar(final char delim) {
872        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
873    }
874
875    /**
876     * Sets the field delimiter string.
877     *
878     * @param delim  the delimiter string to use
879     * @return this, to enable chaining
880     */
881    public StrTokenizer setDelimiterString(final String delim) {
882        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
883    }
884
885    // Quote
886    //-----------------------------------------------------------------------
887    /**
888     * Gets the quote matcher currently in use.
889     * <p>
890     * The quote character is used to wrap data between the tokens.
891     * This enables delimiters to be entered as data.
892     * The default value is '"' (double quote).
893     *
894     * @return the quote matcher in use
895     */
896    public StrMatcher getQuoteMatcher() {
897        return quoteMatcher;
898    }
899
900    /**
901     * Set the quote matcher to use.
902     * <p>
903     * The quote character is used to wrap data between the tokens.
904     * This enables delimiters to be entered as data.
905     *
906     * @param quote  the quote matcher to use, null ignored
907     * @return this, to enable chaining
908     */
909    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
910        if (quote != null) {
911            this.quoteMatcher = quote;
912        }
913        return this;
914    }
915
916    /**
917     * Sets the quote character to use.
918     * <p>
919     * The quote character is used to wrap data between the tokens.
920     * This enables delimiters to be entered as data.
921     *
922     * @param quote  the quote character to use
923     * @return this, to enable chaining
924     */
925    public StrTokenizer setQuoteChar(final char quote) {
926        return setQuoteMatcher(StrMatcher.charMatcher(quote));
927    }
928
929    // Ignored
930    //-----------------------------------------------------------------------
931    /**
932     * Gets the ignored character matcher.
933     * <p>
934     * These characters are ignored when parsing the String, unless they are
935     * within a quoted region.
936     * The default value is not to ignore anything.
937     *
938     * @return the ignored matcher in use
939     */
940    public StrMatcher getIgnoredMatcher() {
941        return ignoredMatcher;
942    }
943
944    /**
945     * Set the matcher for characters to ignore.
946     * <p>
947     * These characters are ignored when parsing the String, unless they are
948     * within a quoted region.
949     *
950     * @param ignored  the ignored matcher to use, null ignored
951     * @return this, to enable chaining
952     */
953    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
954        if (ignored != null) {
955            this.ignoredMatcher = ignored;
956        }
957        return this;
958    }
959
960    /**
961     * Set the character to ignore.
962     * <p>
963     * This character is ignored when parsing the String, unless it is
964     * within a quoted region.
965     *
966     * @param ignored  the ignored character to use
967     * @return this, to enable chaining
968     */
969    public StrTokenizer setIgnoredChar(final char ignored) {
970        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
971    }
972
973    // Trimmer
974    //-----------------------------------------------------------------------
975    /**
976     * Gets the trimmer character matcher.
977     * <p>
978     * These characters are trimmed off on each side of the delimiter
979     * until the token or quote is found.
980     * The default value is not to trim anything.
981     *
982     * @return the trimmer matcher in use
983     */
984    public StrMatcher getTrimmerMatcher() {
985        return trimmerMatcher;
986    }
987
988    /**
989     * Sets the matcher for characters to trim.
990     * <p>
991     * These characters are trimmed off on each side of the delimiter
992     * until the token or quote is found.
993     *
994     * @param trimmer  the trimmer matcher to use, null ignored
995     * @return this, to enable chaining
996     */
997    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
998        if (trimmer != null) {
999            this.trimmerMatcher = trimmer;
1000        }
1001        return this;
1002    }
1003
1004    //-----------------------------------------------------------------------
1005    /**
1006     * Gets whether the tokenizer currently returns empty tokens as null.
1007     * The default for this property is false.
1008     *
1009     * @return true if empty tokens are returned as null
1010     */
1011    public boolean isEmptyTokenAsNull() {
1012        return this.emptyAsNull;
1013    }
1014
1015    /**
1016     * Sets whether the tokenizer should return empty tokens as null.
1017     * The default for this property is false.
1018     *
1019     * @param emptyAsNull  whether empty tokens are returned as null
1020     * @return this, to enable chaining
1021     */
1022    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1023        this.emptyAsNull = emptyAsNull;
1024        return this;
1025    }
1026
1027    //-----------------------------------------------------------------------
1028    /**
1029     * Gets whether the tokenizer currently ignores empty tokens.
1030     * The default for this property is true.
1031     *
1032     * @return true if empty tokens are not returned
1033     */
1034    public boolean isIgnoreEmptyTokens() {
1035        return ignoreEmptyTokens;
1036    }
1037
1038    /**
1039     * Sets whether the tokenizer should ignore and not return empty tokens.
1040     * The default for this property is true.
1041     *
1042     * @param ignoreEmptyTokens  whether empty tokens are not returned
1043     * @return this, to enable chaining
1044     */
1045    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1046        this.ignoreEmptyTokens = ignoreEmptyTokens;
1047        return this;
1048    }
1049
1050    //-----------------------------------------------------------------------
1051    /**
1052     * Gets the String content that the tokenizer is parsing.
1053     *
1054     * @return the string content being parsed
1055     */
1056    public String getContent() {
1057        if (chars == null) {
1058            return null;
1059        }
1060        return new String(chars);
1061    }
1062
1063    //-----------------------------------------------------------------------
1064    /**
1065     * Creates a new instance of this Tokenizer. The new instance is reset so
1066     * that it will be at the start of the token list.
1067     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
1068     *
1069     * @return a new instance of this Tokenizer which has been reset.
1070     */
1071    @Override
1072    public Object clone() {
1073        try {
1074            return cloneReset();
1075        } catch (final CloneNotSupportedException ex) {
1076            return null;
1077        }
1078    }
1079
1080    /**
1081     * Creates a new instance of this Tokenizer. The new instance is reset so that
1082     * it will be at the start of the token list.
1083     *
1084     * @return a new instance of this Tokenizer which has been reset.
1085     * @throws CloneNotSupportedException if there is a problem cloning
1086     */
1087    Object cloneReset() throws CloneNotSupportedException {
1088        // this method exists to enable 100% test coverage
1089        final StrTokenizer cloned = (StrTokenizer) super.clone();
1090        if (cloned.chars != null) {
1091            cloned.chars = cloned.chars.clone();
1092        }
1093        cloned.reset();
1094        return cloned;
1095    }
1096
1097    //-----------------------------------------------------------------------
1098    /**
1099     * Gets the String content that the tokenizer is parsing.
1100     *
1101     * @return the string content being parsed
1102     */
1103    @Override
1104    public String toString() {
1105        if (tokens == null) {
1106            return "StrTokenizer[not tokenized yet]";
1107        }
1108        return "StrTokenizer" + getTokenList();
1109    }
1110
1111}