/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators), supporting quoting
 * and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
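 * For example, a sketch of how the empty-token options interact
 * (results shown as comments):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,,c", ',');
 * tok.getTokenList();                  // ["a", "c"]       (empty tokens ignored by default)
 * tok.setIgnoreEmptyTokens(false);
 * tok.reset("a,,c").getTokenList();    // ["a", "", "c"]
 * tok.setEmptyTokenAsNull(true);
 * tok.reset("a,,c").getTokenList();    // ["a", null, "c"]
 * </pre>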
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 * <tr>
 *  <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 *  <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 *  <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 *  <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
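     * <p>
     * For example (a minimal usage sketch):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b,c\", d");
     * tok.getTokenList();   // ["a", "b,c", "d"]
     * </pre>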
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
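     * <p>
     * For example (a brief usage sketch):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two\tthree");
     * tok.getTokenList();   // ["one", "two", "three"]
     * </pre>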
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
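     * <p>
     * For example (a brief sketch):
     * <pre>
     * new StrTokenizer("a b;c".toCharArray(), StrMatcher.charMatcher(';')).getTokenList();   // ["a b", "c"]
     * </pre>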
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<String>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
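     * <p>
     * For example (a sketch of per-line re-use; {@code lines} and {@code process}
     * stand in for caller-supplied code):
     * <pre>
     * StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
     * for (String line : lines) {
     *     process(tok.reset(line).getTokenList());
     * }
     * </pre>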
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize (the array is cloned), null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then do it.
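     * <p>
     * Tokenization is performed lazily: the public accessors (such as {@link #size()},
     * {@link #hasNext()} and {@link #getTokenList()}) call this method before reading
     * the parsed tokens.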
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
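     * <p>
     * Leading characters matched by the ignored or trimmer matcher are skipped first
     * (unless the same position also matches the delimiter or quote matcher); a delimiter
     * match then produces an empty token, a quote match starts a quoted token, and
     * otherwise an unquoted token is read.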
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading ignored/trimmed characters, unless they
        // match the field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
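                // For example, with double-quote quoting the input "a""b"
                // is read as the single token: a"b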
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use; null is treated as no delimiter (the none matcher)
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
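     * <p>
     * For example (a brief sketch):
     * <pre>
     * new StrTokenizer("a;b;c").setDelimiterChar(';').getTokenList();   // ["a", "b", "c"]
     * </pre>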
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
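     * <p>
     * For example (a sketch using the whitespace trim matcher):
     * <pre>
     * new StrTokenizer(" a , b ", ',').setTrimmerMatcher(StrMatcher.trimMatcher()).getTokenList();   // ["a", "b"]
     * </pre>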
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the parsed
     * tokens if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}