/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators), with support for
 * quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
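 * <p>
 * A typical usage might look like this (an illustrative sketch only):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a b c");
 * while (tok.hasNext()) {
 *     String token = tok.next();
 *     // process the token
 * }
 * </pre>
 * <p>
 * Some examples of input and the resulting tokens: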
 * <pre>
 * "a,b,c"          - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "     - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c"  - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 * <tr>
 * <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1533551 2013-10-18 16:49:15Z sebb $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
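     * <p>
     * A typical usage might be (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b, b\", c");
     * String[] tokens = tok.getTokenArray();   // ["a", "b, b", "c"]
     * </pre>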
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
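     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two three");
     * String[] tokens = tok.getTokenArray();   // ["one", "two", "three"]
     * </pre>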
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
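     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a;b;c".toCharArray(), StrMatcher.charMatcher(';'));
     * </pre>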
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<String>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
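     * <p>
     * For example, assuming <code>lines</code> is a collection of input lines
     * (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     String[] fields = tok.getTokenArray();
     *     // process fields
     * }
     * </pre>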
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize (a clone is used), null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string is found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
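                // For example, with '"' as the quote character, the input
                //     a "b""c" d
                // yields the middle token  b"c  - the doubled quote becomes a literal quote
                // (illustrative example only)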
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
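     * <p>
     * For example, configuring a tokenizer via the fluent setters (an illustrative
     * sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a:'b:b':c")
     *         .setDelimiterChar(':')
     *         .setQuoteChar('\'');
     * String[] tokens = tok.getTokenArray();   // ["a", "b:b", "c"]
     * </pre>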
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
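     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer(" a , b ", ',');
     * tok.setTrimmerMatcher(StrMatcher.trimMatcher());
     * String[] tokens = tok.getTokenArray();   // ["a", "b"]
     * </pre>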
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the list of
     * tokens if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}