/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
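 *
 * <p>
 * A minimal usage sketch (illustrative only; the tokens produced depend on the
 * matchers configured above):
 * <pre>
 * StrTokenizer tokenizer = new StrTokenizer("a b c");
 * while (tokenizer.hasNext()) {
 *     String token = tokenizer.next(); // "a", then "b", then "c"
 * }
 * </pre>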
 *
 * @since 1.0
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
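     * <p>
     * A minimal sketch of the intended call sequence (illustrative only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset(" a, b ,\"c,d\"");
     * String[] fields = tok.getTokenArray(); // three fields: "a", "b", "c,d"
     * </pre>
     *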
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }


    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
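     * <p>
     * A minimal sketch of the intended call sequence (illustrative only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getTSVInstance();
     * tok.reset("a\tb\tc");
     * String[] fields = tok.getTokenArray(); // three fields: "a", "b", "c"
     * </pre>
     *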
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
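     * <p>
     * For example, a sketch of the effect (illustrative only) with a comma
     * delimiter and a double-quote quote character:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,\"b,c\",d", ',', '"');
     * // tokens: "a", "b,c", "d" - the quoted comma is kept as data
     * </pre>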
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed, not cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
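     * <p>
     * A sketch of such reuse (illustrative only; <code>lines</code> and
     * <code>process</code> are assumed to be supplied by the caller):
     * <pre>
     * StrTokenizer tok = new StrTokenizer((String) null, ',');
     * for (String line : lines) {
     *     tok.reset(line);
     *     process(tok.getTokenList()); // hypothetical per-line callback
     * }
     * </pre>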
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input;
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
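     * <p>
     * A sketch of a subclass that post-filters the token list (illustrative only):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a,-,b", ',') {
     *     @Override
     *     protected List<String> tokenize(char[] srcChars, int offset, int count) {
     *         List<String> filtered = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *         filtered.removeIf("-"::equals); // drop placeholder fields
     *         return filtered;
     *     }
     * };
     * // tok.getTokenArray() yields "a", "b"
     * }</pre>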
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading ignored and trimmed characters, unless one is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
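     * <p>
     * A quote may be escaped within a quoted section by doubling it, for
     * example (illustrative only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("'a''b',c", ',');
     * tok.setQuoteChar('\'');
     * // tokens: "a'b", "c"
     * </pre>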
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
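     * <p>
     * A sketch of the interaction with {@link #setIgnoreEmptyTokens(boolean)}
     * (illustrative only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,,b", ',');
     * tok.setIgnoreEmptyTokens(false);
     * tok.setEmptyTokenAsNull(true);
     * // tokens: "a", null, "b"
     * </pre>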
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, listing the parsed
     * tokens, or noting that the input has not been tokenized yet.
     *
     * @return a debug string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}