/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
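 * <p>
 * As an illustration, typical use might look like the following sketch
 * (the tokens shown in the comment are only indicative):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,b,c", ',');
 * while (tok.hasNext()) {
 *     String token = tok.next();  // "a", then "b", then "c"
 * }
 * </pre>
 * <p>
 * Some example inputs and the tokens they produce: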
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1153241 2011-08-02 18:49:52Z ggregory $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
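     * <p>
     * A minimal usage sketch (the tokens shown are only indicative):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, b, c");
     * List&lt;String&gt; tokens = tok.getTokenList();  // "a", "b", "c"
     * </pre>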
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
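     * <p>
     * For example, a sketch of typical use (the tokens shown are only indicative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two three");
     * String[] parts = tok.getTokenArray();  // "one", "two", "three"
     * </pre>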
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed, cloned on construction
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
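     * <p>
     * For example, an illustrative split on either ';' or ',' (assuming the
     * {@link StrMatcher#charSetMatcher(String)} factory method):
     * <pre>
     * char[] data = "a;b,c".toCharArray();
     * StrTokenizer tok = new StrTokenizer(data, StrMatcher.charSetMatcher(";,"));
     * </pre>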
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        for (String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
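     * <p>
     * For example, a sketch of reusing one tokenizer across several lines
     * (the <code>lines</code> collection and <code>process</code> method are hypothetical):
     * <pre>
     * StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
     * for (String line : lines) {
     *     process(tok.reset(line).getTokenList());
     * }
     * </pre>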
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param chars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
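                // For example, with '"' as the quote character, the doubled
                // quote in the input ["a""b"] produces the single token [a"b].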
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
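     * <p>
     * For example, {@code new StrTokenizer("a:b:c").setDelimiterChar(':')}
     * would yield the tokens "a", "b" and "c".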
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting (see {@link StrMatcher#noneMatcher()}).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
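     * <p>
     * For example, with {@link StrMatcher#trimMatcher()} as the trimmer and a
     * comma delimiter, the input {@code " a , b "} would yield the tokens "a" and "b".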
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the list of
     * parsed tokens if tokenization has already taken place.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}