001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3.text;
018
019 import java.util.ArrayList;
020 import java.util.Collections;
021 import java.util.List;
022 import java.util.ListIterator;
023 import java.util.NoSuchElementException;
024
025 import org.apache.commons.lang3.ArrayUtils;
026
027 /**
 * Tokenizes a string based on delimiters (separators)
029 * and supporting quoting and ignored character concepts.
030 * <p>
031 * This class can split a String into many smaller strings. It aims
032 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
033 * however it offers much more control and flexibility including implementing
034 * the <code>ListIterator</code> interface. By default, it is set up
035 * like <code>StringTokenizer</code>.
036 * <p>
037 * The input String is split into a number of <i>tokens</i>.
038 * Each token is separated from the next String by a <i>delimiter</i>.
039 * One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes.
042 * The <i>quote</i> matcher specifies the quote character(s).
043 * A quote may be escaped within a quoted section by duplicating itself.
044 * <p>
045 * Between each token and the delimiter are potentially characters that need trimming.
046 * The <i>trimmer</i> matcher specifies these characters.
047 * One usage might be to trim whitespace characters.
048 * <p>
049 * At any point outside the quotes there might potentially be invalid characters.
050 * The <i>ignored</i> matcher specifies these characters to be removed.
051 * One usage might be to remove new line characters.
052 * <p>
053 * Empty tokens may be removed or returned as null.
054 * <pre>
055 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
056 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
057 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
058 * </pre>
059 * <p>
060 *
061 * This tokenizer has the following properties and options:
062 *
063 * <table>
064 * <tr>
065 * <th>Property</th><th>Type</th><th>Default</th>
066 * </tr>
067 * <tr>
068 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
069 * </tr>
070 * <tr>
071 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
072 * </tr>
073 * <tr>
074 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
075 * </tr>
076 * <tr>
077 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
078 * </tr>
079 * <tr>
080 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
081 * </tr>
082 * </table>
083 *
084 * @since 2.2
085 * @version $Id: StrTokenizer.java 1088899 2011-04-05 05:31:27Z bayard $
086 */
087 public class StrTokenizer implements ListIterator<String>, Cloneable {
088
089 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
090 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
091 static {
092 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
093 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
094 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
095 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
096 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
097 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
098 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
099
100 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
101 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
102 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
103 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
104 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
105 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
106 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
107 }
108
109 /** The text to work on. */
110 private char chars[];
111 /** The parsed tokens */
112 private String tokens[];
113 /** The current iteration position */
114 private int tokenPos;
115
116 /** The delimiter matcher */
117 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
118 /** The quote matcher */
119 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
120 /** The ignored matcher */
121 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
122 /** The trimmer matcher */
123 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
124
125 /** Whether to return empty tokens as null */
126 private boolean emptyAsNull = false;
127 /** Whether to ignore empty tokens */
128 private boolean ignoreEmptyTokens = true;
129
130 //-----------------------------------------------------------------------
131
132 /**
133 * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
134 *
135 * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
136 */
137 private static StrTokenizer getCSVClone() {
138 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
139 }
140
141 /**
142 * Gets a new tokenizer instance which parses Comma Separated Value strings
143 * initializing it with the given input. The default for CSV processing
144 * will be trim whitespace from both ends (which can be overridden with
145 * the setTrimmer method).
146 * <p>
147 * You must call a "reset" method to set the string which you want to parse.
148 * @return a new tokenizer instance which parses Comma Separated Value strings
149 */
150 public static StrTokenizer getCSVInstance() {
151 return getCSVClone();
152 }
153
154 /**
155 * Gets a new tokenizer instance which parses Comma Separated Value strings
156 * initializing it with the given input. The default for CSV processing
157 * will be trim whitespace from both ends (which can be overridden with
158 * the setTrimmer method).
159 *
160 * @param input the text to parse
161 * @return a new tokenizer instance which parses Comma Separated Value strings
162 */
163 public static StrTokenizer getCSVInstance(String input) {
164 StrTokenizer tok = getCSVClone();
165 tok.reset(input);
166 return tok;
167 }
168
169 /**
170 * Gets a new tokenizer instance which parses Comma Separated Value strings
171 * initializing it with the given input. The default for CSV processing
172 * will be trim whitespace from both ends (which can be overridden with
173 * the setTrimmer method).
174 *
175 * @param input the text to parse
176 * @return a new tokenizer instance which parses Comma Separated Value strings
177 */
178 public static StrTokenizer getCSVInstance(char[] input) {
179 StrTokenizer tok = getCSVClone();
180 tok.reset(input);
181 return tok;
182 }
183
184 /**
185 * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
186 *
187 * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
188 */
189 private static StrTokenizer getTSVClone() {
190 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
191 }
192
193
194 /**
195 * Gets a new tokenizer instance which parses Tab Separated Value strings.
196 * The default for CSV processing will be trim whitespace from both ends
197 * (which can be overridden with the setTrimmer method).
198 * <p>
199 * You must call a "reset" method to set the string which you want to parse.
200 * @return a new tokenizer instance which parses Tab Separated Value strings.
201 */
202 public static StrTokenizer getTSVInstance() {
203 return getTSVClone();
204 }
205
206 /**
207 * Gets a new tokenizer instance which parses Tab Separated Value strings.
208 * The default for CSV processing will be trim whitespace from both ends
209 * (which can be overridden with the setTrimmer method).
210 * @param input the string to parse
211 * @return a new tokenizer instance which parses Tab Separated Value strings.
212 */
213 public static StrTokenizer getTSVInstance(String input) {
214 StrTokenizer tok = getTSVClone();
215 tok.reset(input);
216 return tok;
217 }
218
219 /**
220 * Gets a new tokenizer instance which parses Tab Separated Value strings.
221 * The default for CSV processing will be trim whitespace from both ends
222 * (which can be overridden with the setTrimmer method).
223 * @param input the string to parse
224 * @return a new tokenizer instance which parses Tab Separated Value strings.
225 */
226 public static StrTokenizer getTSVInstance(char[] input) {
227 StrTokenizer tok = getTSVClone();
228 tok.reset(input);
229 return tok;
230 }
231
232 //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, may be null
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
387
388 // API
389 //-----------------------------------------------------------------------
390 /**
391 * Gets the number of tokens found in the String.
392 *
393 * @return the number of matched tokens
394 */
395 public int size() {
396 checkTokenized();
397 return tokens.length;
398 }
399
400 /**
401 * Gets the next token from the String.
402 * Equivalent to {@link #next()} except it returns null rather than
403 * throwing {@link NoSuchElementException} when no tokens remain.
404 *
405 * @return the next sequential token, or null when no more tokens are found
406 */
407 public String nextToken() {
408 if (hasNext()) {
409 return tokens[tokenPos++];
410 }
411 return null;
412 }
413
414 /**
415 * Gets the previous token from the String.
416 *
417 * @return the previous sequential token, or null when no more tokens are found
418 */
419 public String previousToken() {
420 if (hasPrevious()) {
421 return tokens[--tokenPos];
422 }
423 return null;
424 }
425
426 /**
427 * Gets a copy of the full token list as an independent modifiable array.
428 *
429 * @return the tokens as a String array
430 */
431 public String[] getTokenArray() {
432 checkTokenized();
433 return tokens.clone();
434 }
435
436 /**
437 * Gets a copy of the full token list as an independent modifiable list.
438 *
439 * @return the tokens as a String array
440 */
441 public List<String> getTokenList() {
442 checkTokenized();
443 List<String> list = new ArrayList<String>(tokens.length);
444 for (String element : tokens) {
445 list.add(element);
446 }
447 return list;
448 }
449
450 /**
451 * Resets this tokenizer, forgetting all parsing and iteration already completed.
452 * <p>
453 * This method allows the same tokenizer to be reused for the same String.
454 *
455 * @return this, to enable chaining
456 */
457 public StrTokenizer reset() {
458 tokenPos = 0;
459 tokens = null;
460 return this;
461 }
462
463 /**
464 * Reset this tokenizer, giving it a new input string to parse.
465 * In this manner you can re-use a tokenizer with the same settings
466 * on multiple input lines.
467 *
468 * @param input the new string to tokenize, null sets no text to parse
469 * @return this, to enable chaining
470 */
471 public StrTokenizer reset(String input) {
472 reset();
473 if (input != null) {
474 this.chars = input.toCharArray();
475 } else {
476 this.chars = null;
477 }
478 return this;
479 }
480
481 /**
482 * Reset this tokenizer, giving it a new input string to parse.
483 * In this manner you can re-use a tokenizer with the same settings
484 * on multiple input lines.
485 *
486 * @param input the new character array to tokenize, not cloned, null sets no text to parse
487 * @return this, to enable chaining
488 */
489 public StrTokenizer reset(char[] input) {
490 reset();
491 this.chars = ArrayUtils.clone(input);
492 return this;
493 }
494
495 // ListIterator
496 //-----------------------------------------------------------------------
497 /**
498 * Checks whether there are any more tokens.
499 *
500 * @return true if there are more tokens
501 */
502 public boolean hasNext() {
503 checkTokenized();
504 return tokenPos < tokens.length;
505 }
506
507 /**
508 * Gets the next token.
509 *
510 * @return the next String token
511 * @throws NoSuchElementException if there are no more elements
512 */
513 public String next() {
514 if (hasNext()) {
515 return tokens[tokenPos++];
516 }
517 throw new NoSuchElementException();
518 }
519
520 /**
521 * Gets the index of the next token to return.
522 *
523 * @return the next token index
524 */
525 public int nextIndex() {
526 return tokenPos;
527 }
528
529 /**
530 * Checks whether there are any previous tokens that can be iterated to.
531 *
532 * @return true if there are previous tokens
533 */
534 public boolean hasPrevious() {
535 checkTokenized();
536 return tokenPos > 0;
537 }
538
539 /**
540 * Gets the token previous to the last returned token.
541 *
542 * @return the previous token
543 */
544 public String previous() {
545 if (hasPrevious()) {
546 return tokens[--tokenPos];
547 }
548 throw new NoSuchElementException();
549 }
550
551 /**
552 * Gets the index of the previous token.
553 *
554 * @return the previous token index
555 */
556 public int previousIndex() {
557 return tokenPos - 1;
558 }
559
560 /**
561 * Unsupported ListIterator operation.
562 *
563 * @throws UnsupportedOperationException always
564 */
565 public void remove() {
566 throw new UnsupportedOperationException("remove() is unsupported");
567 }
568
569 /**
570 * Unsupported ListIterator operation.
571 * @param obj this parameter ignored.
572 * @throws UnsupportedOperationException always
573 */
574 public void set(String obj) {
575 throw new UnsupportedOperationException("set() is unsupported");
576 }
577
578 /**
579 * Unsupported ListIterator operation.
580 * @param obj this parameter ignored.
581 * @throws UnsupportedOperationException always
582 */
583 public void add(String obj) {
584 throw new UnsupportedOperationException("add() is unsupported");
585 }
586
587 // Implementation
588 //-----------------------------------------------------------------------
589 /**
590 * Checks if tokenization has been done, and if not then do it.
591 */
592 private void checkTokenized() {
593 if (tokens == null) {
594 if (chars == null) {
595 // still call tokenize as subclass may do some work
596 List<String> split = tokenize(null, 0, 0);
597 tokens = split.toArray(new String[split.size()]);
598 } else {
599 List<String> split = tokenize(chars, 0, chars.length);
600 tokens = split.toArray(new String[split.size()]);
601 }
602 }
603 }
604
605 /**
606 * Internal method to performs the tokenization.
607 * <p>
608 * Most users of this class do not need to call this method. This method
609 * will be called automatically by other (public) methods when required.
610 * <p>
611 * This method exists to allow subclasses to add code before or after the
612 * tokenization. For example, a subclass could alter the character array,
613 * offset or count to be parsed, or call the tokenizer multiple times on
614 * multiple strings. It is also be possible to filter the results.
615 * <p>
616 * <code>StrTokenizer</code> will always pass a zero offset and a count
617 * equal to the length of the array to this method, however a subclass
618 * may pass other values, or even an entirely different array.
619 *
620 * @param chars the character array being tokenized, may be null
621 * @param offset the start position within the character array, must be valid
622 * @param count the number of characters to tokenize, must be valid
623 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
624 */
625 protected List<String> tokenize(char[] chars, int offset, int count) {
626 if (chars == null || count == 0) {
627 return Collections.emptyList();
628 }
629 StrBuilder buf = new StrBuilder();
630 List<String> tokens = new ArrayList<String>();
631 int pos = offset;
632
633 // loop around the entire buffer
634 while (pos >= 0 && pos < count) {
635 // find next token
636 pos = readNextToken(chars, pos, count, buf, tokens);
637
638 // handle case where end of string is a delimiter
639 if (pos >= count) {
640 addToken(tokens, "");
641 }
642 }
643 return tokens;
644 }
645
646 /**
647 * Adds a token to a list, paying attention to the parameters we've set.
648 *
649 * @param list the list to add to
650 * @param tok the token to add
651 */
652 private void addToken(List<String> list, String tok) {
653 if (tok == null || tok.length() == 0) {
654 if (isIgnoreEmptyTokens()) {
655 return;
656 }
657 if (isEmptyTokenAsNull()) {
658 tok = null;
659 }
660 }
661 list.add(tok);
662 }
663
664 /**
665 * Reads character by character through the String to get the next token.
666 *
667 * @param chars the character array being tokenized
668 * @param start the first character of field
669 * @param len the length of the character array being tokenized
670 * @param workArea a temporary work area
671 * @param tokens the list of parsed tokens
672 * @return the starting position of the next field (the character
673 * immediately after the delimiter), or -1 if end of string found
674 */
675 private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
676 // skip all leading whitespace, unless it is the
677 // field delimiter or the quote character
678 while (start < len) {
679 int removeLen = Math.max(
680 getIgnoredMatcher().isMatch(chars, start, start, len),
681 getTrimmerMatcher().isMatch(chars, start, start, len));
682 if (removeLen == 0 ||
683 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
684 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
685 break;
686 }
687 start += removeLen;
688 }
689
690 // handle reaching end
691 if (start >= len) {
692 addToken(tokens, "");
693 return -1;
694 }
695
696 // handle empty token
697 int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
698 if (delimLen > 0) {
699 addToken(tokens, "");
700 return start + delimLen;
701 }
702
703 // handle found token
704 int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
705 if (quoteLen > 0) {
706 return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
707 }
708 return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
709 }
710
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Alternates between quoted and unquoted mode until an unquoted
     * delimiter or the end of the input is reached. Characters matched by
     * the trimmer are copied into the work area speculatively; {@code trimStart}
     * records the length of the work area up to the last non-trimmable
     * character, so trailing trimmed characters are discarded when the
     * token is emitted.
     *
     * @param chars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  was reached
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // whether we started inside a quoted section
        boolean quoting = (quoteLen > 0);
        // length of workArea up to the last character that must be kept
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }
811
812 /**
813 * Checks if the characters at the index specified match the quote
814 * already matched in readNextToken().
815 *
816 * @param chars the character array being tokenized
817 * @param pos the position to check for a quote
818 * @param len the length of the character array being tokenized
819 * @param quoteStart the start position of the matched quote, 0 if no quoting
820 * @param quoteLen the length of the matched quote, 0 if no quoting
821 * @return true if a quote is matched
822 */
823 private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
824 for (int i = 0; i < quoteLen; i++) {
825 if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
826 return false;
827 }
828 }
829 return true;
830 }
831
832 // Delimiter
833 //-----------------------------------------------------------------------
834 /**
835 * Gets the field delimiter matcher.
836 *
837 * @return the delimiter matcher in use
838 */
839 public StrMatcher getDelimiterMatcher() {
840 return this.delimMatcher;
841 }
842
843 /**
844 * Sets the field delimiter matcher.
845 * <p>
846 * The delimitier is used to separate one token from another.
847 *
848 * @param delim the delimiter matcher to use
849 * @return this, to enable chaining
850 */
851 public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
852 if (delim == null) {
853 this.delimMatcher = StrMatcher.noneMatcher();
854 } else {
855 this.delimMatcher = delim;
856 }
857 return this;
858 }
859
860 /**
861 * Sets the field delimiter character.
862 *
863 * @param delim the delimiter character to use
864 * @return this, to enable chaining
865 */
866 public StrTokenizer setDelimiterChar(char delim) {
867 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
868 }
869
870 /**
871 * Sets the field delimiter string.
872 *
873 * @param delim the delimiter string to use
874 * @return this, to enable chaining
875 */
876 public StrTokenizer setDelimiterString(String delim) {
877 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
878 }
879
880 // Quote
881 //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * By default no quoting is in use (a none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }
923
924 // Ignored
925 //-----------------------------------------------------------------------
926 /**
927 * Gets the ignored character matcher.
928 * <p>
929 * These characters are ignored when parsing the String, unless they are
930 * within a quoted region.
931 * The default value is not to ignore anything.
932 *
933 * @return the ignored matcher in use
934 */
935 public StrMatcher getIgnoredMatcher() {
936 return ignoredMatcher;
937 }
938
939 /**
940 * Set the matcher for characters to ignore.
941 * <p>
942 * These characters are ignored when parsing the String, unless they are
943 * within a quoted region.
944 *
945 * @param ignored the ignored matcher to use, null ignored
946 * @return this, to enable chaining
947 */
948 public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
949 if (ignored != null) {
950 this.ignoredMatcher = ignored;
951 }
952 return this;
953 }
954
955 /**
956 * Set the character to ignore.
957 * <p>
958 * This character is ignored when parsing the String, unless it is
959 * within a quoted region.
960 *
961 * @param ignored the ignored character to use
962 * @return this, to enable chaining
963 */
964 public StrTokenizer setIgnoredChar(char ignored) {
965 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
966 }
967
968 // Trimmer
969 //-----------------------------------------------------------------------
970 /**
971 * Gets the trimmer character matcher.
972 * <p>
973 * These characters are trimmed off on each side of the delimiter
974 * until the token or quote is found.
975 * The default value is not to trim anything.
976 *
977 * @return the trimmer matcher in use
978 */
979 public StrMatcher getTrimmerMatcher() {
980 return trimmerMatcher;
981 }
982
983 /**
984 * Sets the matcher for characters to trim.
985 * <p>
986 * These characters are trimmed off on each side of the delimiter
987 * until the token or quote is found.
988 *
989 * @param trimmer the trimmer matcher to use, null ignored
990 * @return this, to enable chaining
991 */
992 public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
993 if (trimmer != null) {
994 this.trimmerMatcher = trimmer;
995 }
996 return this;
997 }
998
999 //-----------------------------------------------------------------------
1000 /**
1001 * Gets whether the tokenizer currently returns empty tokens as null.
1002 * The default for this property is false.
1003 *
1004 * @return true if empty tokens are returned as null
1005 */
1006 public boolean isEmptyTokenAsNull() {
1007 return this.emptyAsNull;
1008 }
1009
1010 /**
1011 * Sets whether the tokenizer should return empty tokens as null.
1012 * The default for this property is false.
1013 *
1014 * @param emptyAsNull whether empty tokens are returned as null
1015 * @return this, to enable chaining
1016 */
1017 public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1018 this.emptyAsNull = emptyAsNull;
1019 return this;
1020 }
1021
1022 //-----------------------------------------------------------------------
1023 /**
1024 * Gets whether the tokenizer currently ignores empty tokens.
1025 * The default for this property is true.
1026 *
1027 * @return true if empty tokens are not returned
1028 */
1029 public boolean isIgnoreEmptyTokens() {
1030 return ignoreEmptyTokens;
1031 }
1032
1033 /**
1034 * Sets whether the tokenizer should ignore and not return empty tokens.
1035 * The default for this property is true.
1036 *
1037 * @param ignoreEmptyTokens whether empty tokens are not returned
1038 * @return this, to enable chaining
1039 */
1040 public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1041 this.ignoreEmptyTokens = ignoreEmptyTokens;
1042 return this;
1043 }
1044
1045 //-----------------------------------------------------------------------
1046 /**
1047 * Gets the String content that the tokenizer is parsing.
1048 *
1049 * @return the string content being parsed
1050 */
1051 public String getContent() {
1052 if (chars == null) {
1053 return null;
1054 }
1055 return new String(chars);
1056 }
1057
1058 //-----------------------------------------------------------------------
1059 /**
1060 * Creates a new instance of this Tokenizer. The new instance is reset so
1061 * that it will be at the start of the token list.
1062 * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1063 *
1064 * @return a new instance of this Tokenizer which has been reset.
1065 */
1066 @Override
1067 public Object clone() {
1068 try {
1069 return cloneReset();
1070 } catch (CloneNotSupportedException ex) {
1071 return null;
1072 }
1073 }
1074
1075 /**
1076 * Creates a new instance of this Tokenizer. The new instance is reset so that
1077 * it will be at the start of the token list.
1078 *
1079 * @return a new instance of this Tokenizer which has been reset.
1080 * @throws CloneNotSupportedException if there is a problem cloning
1081 */
1082 Object cloneReset() throws CloneNotSupportedException {
1083 // this method exists to enable 100% test coverage
1084 StrTokenizer cloned = (StrTokenizer) super.clone();
1085 if (cloned.chars != null) {
1086 cloned.chars = cloned.chars.clone();
1087 }
1088 cloned.reset();
1089 return cloned;
1090 }
1091
1092 //-----------------------------------------------------------------------
    /**
     * Gets a debugging String describing this tokenizer, including the
     * token list if tokenization has already been performed.
     *
     * @return a debug string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }
1105
1106 }