/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
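 * <p>
 * A minimal usage sketch (illustrative only; the input value and variable names are examples):
 * <pre>
 * StrTokenizer tok = StrTokenizer.getCSVInstance("a, \"b,c\", d");
 * while (tok.hasNext()) {
 *     String token = tok.next();   // "a", then "b,c", then "d"
 * }
 * </pre>
 * <p>
 * The examples below show how different inputs are split (comma delimiter,
 * CSV-style trimming and quoting):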
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StrTokenizer.html">
 * StrTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens. */
    private String tokens[];
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer. The input array is cloned, so later changes to
     * it do not affect the tokenizer.
     *
     * @param input  the character array which is to be parsed
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
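     * <p>
     * An illustrative sketch (the input text and variable name are examples):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("one two\tthree".toCharArray(), StrMatcher.splitMatcher());
     * // tokens: "one", "two", "three"
     * </pre>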
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
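     * <p>
     * An illustrative sketch of such reuse (the {@code lines} collection is a hypothetical variable):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     List&lt;String&gt; fields = tok.reset(line).getTokenList();
     * }
     * </pre>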
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines. The array is cloned, so later changes to it
     * do not affect the tokenizer.
     *
     * @param input  the new character array to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
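                // (For example, with a double-quote quote character the
                // quoted input "a""b" is read as the single token a"b.)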
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the specified index match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
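     * <p>
     * For example (an illustrative sketch), {@code new StrTokenizer("a;b;c").setDelimiterChar(';')}
     * tokenizes on semicolons instead of the default whitespace characters.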
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is the none matcher, meaning no quoting is recognized.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
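     * <p>
     * For example (an illustrative sketch), calling {@code setTrimmerMatcher(StrMatcher.trimMatcher())}
     * trims whitespace around each token, as the CSV and TSV instances do by default.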
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code>
     * is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the list
     * of parsed tokens once tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}