/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
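 * A typical usage sketch, using only constructors and iteration methods declared in this class:
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,b,c", ',');
 * while (tok.hasNext()) {
 *     String token = tok.next();   // "a", then "b", then "c"
 * }
 * </pre>
 *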
 * @author Apache Software Foundation
 * @author Matthew Inger
 * @author Gary D. Gregory
 * @since 2.2
 * @version $Id: StrTokenizer.java 907630 2010-02-08 12:22:32Z sebb $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends of a token (which can be overridden
     * with the setTrimmer method).
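     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance(" a, \"b, c\", d ");
     * String[] fields = tok.getTokenArray();   // "a", "b, c", "d"
     * </pre>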
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends of a token (which can be overridden
     * with the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
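     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a;b;c", ';');
     * // tokens: "a", "b", "c"
     * </pre>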
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed, cloned on construction
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
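     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("'a b' c".toCharArray(), ' ', '\'');
     * // tokens: "a b", "c"
     * </pre>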
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        for (String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
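     * <p>
     * For example (illustrative sketch; <code>lines</code> is assumed to be a <code>char[][]</code>):
     * <pre>
     * StrTokenizer tok = new StrTokenizer((char[]) null, ',');
     * for (char[] line : lines) {
     *     String[] fields = tok.reset(line).getTokenArray();
     * }
     * </pre>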
     *
     * @param input  the new character array to tokenize, cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
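     * <p>
     * For example, a subclass might post-process the token list (an illustrative
     * sketch, not a prescribed pattern):
     * <pre>
     * StrTokenizer upper = new StrTokenizer("a,b", ',') {
     *     protected List&lt;String&gt; tokenize(char[] chars, int offset, int count) {
     *         List&lt;String&gt; result = super.tokenize(chars, offset, count);
     *         for (int i = 0; i &lt; result.size(); i++) {
     *             result.set(i, result.get(i).toUpperCase());
     *         }
     *         return result;
     *     }
     * };
     * </pre>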
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param chars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
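     * A quote is escaped within a quoted section by doubling it; for example,
     * with a double-quote quote character (illustrative):
     * <pre>
     * "a ""b"" c"   becomes the single token   a "b" c
     * </pre>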
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen)
    {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
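     * <p>
     * For instance (illustrative), ignoring new line characters:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,b\nc", ',').setIgnoredChar('\n');
     * // tokens: "a", "bc"
     * </pre>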
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of the tokenizer, including the parsed
     * tokens if tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}