001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang.text;
018    
019    import java.util.ArrayList;
020    import java.util.Collections;
021    import java.util.List;
022    import java.util.ListIterator;
023    import java.util.NoSuchElementException;
024    
025    /**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
028     * <p>
029     * This class can split a String into many smaller strings. It aims
030     * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * but offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
033     * like <code>StringTokenizer</code>.
034     * <p>
035     * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
037     * One or more delimiter characters must be specified.
038     * <p>
039     * Each token may be surrounded by quotes.
040     * The <i>quote</i> matcher specifies the quote character(s).
041     * A quote may be escaped within a quoted section by duplicating itself.
042     * <p>
 * Between each token and the delimiter there may be characters that need trimming.
044     * The <i>trimmer</i> matcher specifies these characters.
045     * One usage might be to trim whitespace characters.
046     * <p>
 * At any point outside the quotes there may be invalid characters.
048     * The <i>ignored</i> matcher specifies these characters to be removed.
049     * One usage might be to remove new line characters.
050     * <p>
051     * Empty tokens may be removed or returned as null.
052     * <pre>
053     * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
054     * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
055     * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
056     * </pre>
057     * <p>
058     *
059     * This tokenizer has the following properties and options:
060     *
061     * <table>
062     *  <tr>
063     *   <th>Property</th><th>Type</th><th>Default</th>
064     *  </tr>
065     *  <tr>
066     *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
067     *  </tr>
068     *  <tr>
069     *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
070     *  </tr>
071     *  <tr>
072     *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
073     *  </tr>
074     *  <tr>
075     *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
076     *  </tr>
077     *  <tr>
078     *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
079     *  </tr>
080     * </table>
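 * <p>
 * A typical usage sketch (illustrative only; any delimiter and trimmer may be used):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a; b ;c", ';');
 * tok.setTrimmerMatcher(StrMatcher.trimMatcher());
 * while (tok.hasNext()) {
 *     String token = (String) tok.next();   // "a", then "b", then "c"
 * }
 * </pre>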
081     *
082     * @author Apache Software Foundation
083     * @author Matthew Inger
084     * @author Gary D. Gregory
085     * @since 2.2
086     * @version $Id: StrTokenizer.java 907631 2010-02-08 12:22:48Z sebb $
087     */
088    public class StrTokenizer implements ListIterator, Cloneable {
089    
090        private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
091        private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
092        static {
093            CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
094            CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
095            CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
096            CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
097            CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
098            CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
099            CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100    
101            TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102            TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103            TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104            TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105            TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106            TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107            TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108        }
109    
110        /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
114        /** The current iteration position */
115        private int tokenPos;
116    
117        /** The delimiter matcher */
118        private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119        /** The quote matcher */
120        private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121        /** The ignored matcher */
122        private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123        /** The trimmer matcher */
124        private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125    
126        /** Whether to return empty tokens as null */
127        private boolean emptyAsNull = false;
128        /** Whether to ignore empty tokens */
129        private boolean ignoreEmptyTokens = true;
130    
131        //-----------------------------------------------------------------------
132    
133        /**
134         * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135         * 
136         * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137         */
138        private static StrTokenizer getCSVClone() {
139            return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140        }
141    
142        /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends of each
     * token (this can be overridden with the setTrimmerMatcher method).
147         * <p>
148         * You must call a "reset" method to set the string which you want to parse.
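     * <p>
     * A minimal usage sketch (illustrative input only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b, c\", d");
     * String[] fields = tok.getTokenArray();   // "a", "b, c", "d"
     * </pre>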
149         * @return a new tokenizer instance which parses Comma Separated Value strings
150         */
151        public static StrTokenizer getCSVInstance() {
152            return getCSVClone();
153        }
154    
155        /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmerMatcher method).
160         *
161         * @param input  the text to parse
162         * @return a new tokenizer instance which parses Comma Separated Value strings
163         */
164        public static StrTokenizer getCSVInstance(String input) {
165            StrTokenizer tok = getCSVClone();
166            tok.reset(input);
167            return tok;
168        }
169    
170        /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmerMatcher method).
175         *
176         * @param input  the text to parse
177         * @return a new tokenizer instance which parses Comma Separated Value strings
178         */
179        public static StrTokenizer getCSVInstance(char[] input) {
180            StrTokenizer tok = getCSVClone();
181            tok.reset(input);
182            return tok;
183        }
184    
185        /**
186         * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187         * 
188         * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189         */
190        private static StrTokenizer getTSVClone() {
191            return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192        }
193    
194    
195        /**
196         * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmerMatcher method).
199         * <p>
200         * You must call a "reset" method to set the string which you want to parse.
201         * @return a new tokenizer instance which parses Tab Separated Value strings.
202         */
203        public static StrTokenizer getTSVInstance() {
204            return getTSVClone();
205        }
206    
207        /**
208         * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmerMatcher method).
211         * @param input  the string to parse
212         * @return a new tokenizer instance which parses Tab Separated Value strings.
213         */
214        public static StrTokenizer getTSVInstance(String input) {
215            StrTokenizer tok = getTSVClone();
216            tok.reset(input);
217            return tok;
218        }
219    
220        /**
221         * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmerMatcher method).
224         * @param input  the string to parse
225         * @return a new tokenizer instance which parses Tab Separated Value strings.
226         */
227        public static StrTokenizer getTSVInstance(char[] input) {
228            StrTokenizer tok = getTSVClone();
229            tok.reset(input);
230            return tok;
231        }
232    
233        //-----------------------------------------------------------------------
234        /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return
     * and formfeed as per StringTokenizer, but with no text to tokenize.
237         * <p>
238         * This constructor is normally used with {@link #reset(String)}.
239         */
240        public StrTokenizer() {
241            super();
242            this.chars = null;
243        }
244    
245        /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return
     * and formfeed as per StringTokenizer.
248         *
249         * @param input  the string which is to be parsed
250         */
251        public StrTokenizer(String input) {
252            super();
253            if (input != null) {
254                chars = input.toCharArray();
255            } else {
256                chars = null;
257            }
258        }
259    
260        /**
261         * Constructs a tokenizer splitting on the specified delimiter character.
262         *
263         * @param input  the string which is to be parsed
264         * @param delim  the field delimiter character
265         */
266        public StrTokenizer(String input, char delim) {
267            this(input);
268            setDelimiterChar(delim);
269        }
270    
271        /**
272         * Constructs a tokenizer splitting on the specified delimiter string.
273         *
274         * @param input  the string which is to be parsed
275         * @param delim  the field delimiter string
276         */
277        public StrTokenizer(String input, String delim) {
278            this(input);
279            setDelimiterString(delim);
280        }
281    
282        /**
283         * Constructs a tokenizer splitting using the specified delimiter matcher.
284         *
285         * @param input  the string which is to be parsed
286         * @param delim  the field delimiter matcher
287         */
288        public StrTokenizer(String input, StrMatcher delim) {
289            this(input);
290            setDelimiterMatcher(delim);
291        }
292    
293        /**
294         * Constructs a tokenizer splitting on the specified delimiter character
295         * and handling quotes using the specified quote character.
296         *
297         * @param input  the string which is to be parsed
298         * @param delim  the field delimiter character
299         * @param quote  the field quoted string character
300         */
301        public StrTokenizer(String input, char delim, char quote) {
302            this(input, delim);
303            setQuoteChar(quote);
304        }
305    
306        /**
307         * Constructs a tokenizer splitting using the specified delimiter matcher
308         * and handling quotes using the specified quote matcher.
309         *
310         * @param input  the string which is to be parsed
311         * @param delim  the field delimiter matcher
312         * @param quote  the field quoted string matcher
313         */
314        public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
315            this(input, delim);
316            setQuoteMatcher(quote);
317        }
318    
319        /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return
     * and formfeed as per StringTokenizer.
322         * <p>
323         * The input character array is not cloned, and must not be altered after
324         * passing in to this method.
325         *
326         * @param input  the string which is to be parsed, not cloned
327         */
328        public StrTokenizer(char[] input) {
329            super();
330            this.chars = input;
331        }
332    
333        /**
334         * Constructs a tokenizer splitting on the specified character.
335         * <p>
336         * The input character array is not cloned, and must not be altered after
337         * passing in to this method.
338         *
339         * @param input  the string which is to be parsed, not cloned
340         * @param delim the field delimiter character
341         */
342        public StrTokenizer(char[] input, char delim) {
343            this(input);
344            setDelimiterChar(delim);
345        }
346    
347        /**
348         * Constructs a tokenizer splitting on the specified string.
349         * <p>
350         * The input character array is not cloned, and must not be altered after
351         * passing in to this method.
352         *
353         * @param input  the string which is to be parsed, not cloned
354         * @param delim the field delimiter string
355         */
356        public StrTokenizer(char[] input, String delim) {
357            this(input);
358            setDelimiterString(delim);
359        }
360    
361        /**
362         * Constructs a tokenizer splitting using the specified delimiter matcher.
363         * <p>
364         * The input character array is not cloned, and must not be altered after
365         * passing in to this method.
366         *
367         * @param input  the string which is to be parsed, not cloned
368         * @param delim  the field delimiter matcher
369         */
370        public StrTokenizer(char[] input, StrMatcher delim) {
371            this(input);
372            setDelimiterMatcher(delim);
373        }
374    
375        /**
376         * Constructs a tokenizer splitting on the specified delimiter character
377         * and handling quotes using the specified quote character.
378         * <p>
379         * The input character array is not cloned, and must not be altered after
380         * passing in to this method.
381         *
382         * @param input  the string which is to be parsed, not cloned
383         * @param delim  the field delimiter character
384         * @param quote  the field quoted string character
385         */
386        public StrTokenizer(char[] input, char delim, char quote) {
387            this(input, delim);
388            setQuoteChar(quote);
389        }
390    
391        /**
392         * Constructs a tokenizer splitting using the specified delimiter matcher
393         * and handling quotes using the specified quote matcher.
394         * <p>
395         * The input character array is not cloned, and must not be altered after
396         * passing in to this method.
397         *
398         * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
401         */
402        public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
403            this(input, delim);
404            setQuoteMatcher(quote);
405        }
406    
407        // API
408        //-----------------------------------------------------------------------
409        /**
410         * Gets the number of tokens found in the String.
411         *
412         * @return the number of matched tokens
413         */
414        public int size() {
415            checkTokenized();
416            return tokens.length;
417        }
418    
419        /**
420         * Gets the next token from the String.
421         * Equivalent to {@link #next()} except it returns null rather than
422         * throwing {@link NoSuchElementException} when no tokens remain.
423         *
424         * @return the next sequential token, or null when no more tokens are found
425         */
426        public String nextToken() {
427            if (hasNext()) {
428                return tokens[tokenPos++];
429            }
430            return null;
431        }
432    
433        /**
434         * Gets the previous token from the String.
435         *
     * @return the previous sequential token, or null if there are no previous tokens
437         */
438        public String previousToken() {
439            if (hasPrevious()) {
440                return tokens[--tokenPos];
441            }
442            return null;
443        }
444    
445        /**
446         * Gets a copy of the full token list as an independent modifiable array.
447         *
448         * @return the tokens as a String array
449         */
450        public String[] getTokenArray() {
451            checkTokenized();
452            return (String[]) tokens.clone();
453        }
454    
455        /**
456         * Gets a copy of the full token list as an independent modifiable list.
457         *
     * @return the tokens as a String list
459         */
460        public List getTokenList() {
461            checkTokenized();
462            List list = new ArrayList(tokens.length);
463            for (int i = 0; i < tokens.length; i++) {
464                list.add(tokens[i]);
465            }
466            return list;
467        }
468    
469        /**
470         * Resets this tokenizer, forgetting all parsing and iteration already completed.
471         * <p>
472         * This method allows the same tokenizer to be reused for the same String.
473         *
474         * @return this, to enable chaining
475         */
476        public StrTokenizer reset() {
477            tokenPos = 0;
478            tokens = null;
479            return this;
480        }
481    
482        /**
483         * Reset this tokenizer, giving it a new input string to parse.
484         * In this manner you can re-use a tokenizer with the same settings
485         * on multiple input lines.
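     * <p>
     * A reuse sketch (assumes <code>reader</code> is an open <code>java.io.BufferedReader</code>):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * String line;
     * while ((line = reader.readLine()) != null) {
     *     String[] fields = tok.reset(line).getTokenArray();
     *     // process fields ...
     * }
     * </pre>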
486         *
487         * @param input  the new string to tokenize, null sets no text to parse
488         * @return this, to enable chaining
489         */
490        public StrTokenizer reset(String input) {
491            reset();
492            if (input != null) {
493                this.chars = input.toCharArray();
494            } else {
495                this.chars = null;
496            }
497            return this;
498        }
499    
500        /**
501         * Reset this tokenizer, giving it a new input string to parse.
502         * In this manner you can re-use a tokenizer with the same settings
503         * on multiple input lines.
504         * <p>
505         * The input character array is not cloned, and must not be altered after
506         * passing in to this method.
507         *
508         * @param input  the new character array to tokenize, not cloned, null sets no text to parse
509         * @return this, to enable chaining
510         */
511        public StrTokenizer reset(char[] input) {
512            reset();
513            this.chars = input;
514            return this;
515        }
516    
517        // ListIterator
518        //-----------------------------------------------------------------------
519        /**
520         * Checks whether there are any more tokens.
521         *
522         * @return true if there are more tokens
523         */
524        public boolean hasNext() {
525            checkTokenized();
526            return tokenPos < tokens.length;
527        }
528    
529        /**
530         * Gets the next token.
531         *
532         * @return the next String token
533         * @throws NoSuchElementException if there are no more elements
534         */
535        public Object next() {
536            if (hasNext()) {
537                return tokens[tokenPos++];
538            }
539            throw new NoSuchElementException();
540        }
541    
542        /**
543         * Gets the index of the next token to return.
544         *
545         * @return the next token index
546         */
547        public int nextIndex() {
548            return tokenPos;
549        }
550    
551        /**
552         * Checks whether there are any previous tokens that can be iterated to.
553         *
554         * @return true if there are previous tokens
555         */
556        public boolean hasPrevious() {
557            checkTokenized();
558            return tokenPos > 0;
559        }
560    
561        /**
562         * Gets the token previous to the last returned token.
563         *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
565         */
566        public Object previous() {
567            if (hasPrevious()) {
568                return tokens[--tokenPos];
569            }
570            throw new NoSuchElementException();
571        }
572    
573        /**
574         * Gets the index of the previous token.
575         *
576         * @return the previous token index
577         */
578        public int previousIndex() {
579            return tokenPos - 1;
580        }
581    
582        /**
583         * Unsupported ListIterator operation.
584         *
585         * @throws UnsupportedOperationException always
586         */
587        public void remove() {
588            throw new UnsupportedOperationException("remove() is unsupported");
589        }
590    
591        /**
592         * Unsupported ListIterator operation.
593         * @param obj this parameter ignored.
594         * @throws UnsupportedOperationException always
595         */
596        public void set(Object obj) {
597            throw new UnsupportedOperationException("set() is unsupported");
598        }
599    
600        /**
601         * Unsupported ListIterator operation.
602         * @param obj this parameter ignored.
603         * @throws UnsupportedOperationException always
604         */
605        public void add(Object obj) {
606            throw new UnsupportedOperationException("add() is unsupported");
607        }
608    
609        // Implementation
610        //-----------------------------------------------------------------------
611        /**
     * Checks if tokenization has been done, and if not then does it.
613         */
614        private void checkTokenized() {
615            if (tokens == null) {
616                if (chars == null) {
617                    // still call tokenize as subclass may do some work
618                    List split = tokenize(null, 0, 0);
619                    tokens = (String[]) split.toArray(new String[split.size()]);
620                } else {
621                    List split = tokenize(chars, 0, chars.length);
622                    tokens = (String[]) split.toArray(new String[split.size()]);
623                }
624            }
625        }
626    
627        /**
     * Internal method that performs the tokenization.
629         * <p>
630         * Most users of this class do not need to call this method. This method
631         * will be called automatically by other (public) methods when required.
632         * <p>
633         * This method exists to allow subclasses to add code before or after the
634         * tokenization. For example, a subclass could alter the character array,
635         * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
637         * <p>
638         * <code>StrTokenizer</code> will always pass a zero offset and a count
639         * equal to the length of the array to this method, however a subclass
640         * may pass other values, or even an entirely different array.
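     * <p>
     * A minimal subclass sketch (purely illustrative) that post-processes the result:
     * <pre>
     * protected List tokenize(char[] chars, int offset, int count) {
     *     List tokens = super.tokenize(chars, offset, count);
     *     Collections.reverse(tokens);  // for example, reverse the token order
     *     return tokens;
     * }
     * </pre>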
641         * 
642         * @param chars  the character array being tokenized, may be null
643         * @param offset  the start position within the character array, must be valid
644         * @param count  the number of characters to tokenize, must be valid
645         * @return the modifiable list of String tokens, unmodifiable if null array or zero count
646         */
647        protected List tokenize(char[] chars, int offset, int count) {
648            if (chars == null || count == 0) {
649                return Collections.EMPTY_LIST;
650            }
651            StrBuilder buf = new StrBuilder();
652            List tokens = new ArrayList();
653            int pos = offset;
654            
655            // loop around the entire buffer
656            while (pos >= 0 && pos < count) {
657                // find next token
658                pos = readNextToken(chars, pos, count, buf, tokens);
659                
660                // handle case where end of string is a delimiter
661                if (pos >= count) {
662                    addToken(tokens, "");
663                }
664            }
665            return tokens;
666        }
667    
668        /**
669         * Adds a token to a list, paying attention to the parameters we've set.
670         *
671         * @param list  the list to add to
672         * @param tok  the token to add
673         */
674        private void addToken(List list, String tok) {
675            if (tok == null || tok.length() == 0) {
676                if (isIgnoreEmptyTokens()) {
677                    return;
678                }
679                if (isEmptyTokenAsNull()) {
680                    tok = null;
681                }
682            }
683            list.add(tok);
684        }
685    
686        /**
687         * Reads character by character through the String to get the next token.
688         *
689         * @param chars  the character array being tokenized
690         * @param start  the first character of field
691         * @param len  the length of the character array being tokenized
692         * @param workArea  a temporary work area
693         * @param tokens  the list of parsed tokens
694         * @return the starting position of the next field (the character
695         *  immediately after the delimiter), or -1 if end of string found
696         */
697        private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
        // skip all leading ignored or trimmed characters, unless one is the
        // field delimiter or the quote character
700            while (start < len) {
701                int removeLen = Math.max(
702                        getIgnoredMatcher().isMatch(chars, start, start, len),
703                        getTrimmerMatcher().isMatch(chars, start, start, len));
704                if (removeLen == 0 ||
705                    getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
706                    getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
707                    break;
708                }
709                start += removeLen;
710            }
711            
712            // handle reaching end
713            if (start >= len) {
714                addToken(tokens, "");
715                return -1;
716            }
717            
718            // handle empty token
719            int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
720            if (delimLen > 0) {
721                addToken(tokens, "");
722                return start + delimLen;
723            }
724            
725            // handle found token
726            int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
727            if (quoteLen > 0) {
728                return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
729            }
730            return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
731        }
732    
733        /**
734         * Reads a possibly quoted string token.
735         *
736         * @param chars  the character array being tokenized
737         * @param start  the first character of field
738         * @param len  the length of the character array being tokenized
739         * @param workArea  a temporary work area
740         * @param tokens  the list of parsed tokens
741         * @param quoteStart  the start position of the matched quote, 0 if no quoting
742         * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  is found
746         */
747        private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea, 
748                                   List tokens, int quoteStart, int quoteLen) 
749        {
750            // Loop until we've found the end of the quoted
751            // string or the end of the input
752            workArea.clear();
753            int pos = start;
754            boolean quoting = (quoteLen > 0);
755            int trimStart = 0;
756            
757            while (pos < len) {
758                // quoting mode can occur several times throughout a string
759                // we must switch between quoting and non-quoting until we
760                // encounter a non-quoted delimiter, or end of string
761                if (quoting) {
762                    // In quoting mode
763                    
764                    // If we've found a quote character, see if it's
765                    // followed by a second quote.  If so, then we need
766                    // to actually put the quote character into the token
767                    // rather than end the token.
768                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
769                        if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
770                            // matched pair of quotes, thus an escaped quote
771                            workArea.append(chars, pos, quoteLen);
772                            pos += (quoteLen * 2);
773                            trimStart = workArea.size();
774                            continue;
775                        }
776                        
777                        // end of quoting
778                        quoting = false;
779                        pos += quoteLen;
780                        continue;
781                    }
782                    
783                    // copy regular character from inside quotes
784                    workArea.append(chars[pos++]);
785                    trimStart = workArea.size();
786                    
787                } else {
788                    // Not in quoting mode
789                    
790                    // check for delimiter, and thus end of token
791                    int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
792                    if (delimLen > 0) {
793                        // return condition when end of token found
794                        addToken(tokens, workArea.substring(0, trimStart));
795                        return pos + delimLen;
796                    }
797                    
798                    // check for quote, and thus back into quoting mode
799                    if (quoteLen > 0) {
800                        if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
801                            quoting = true;
802                            pos += quoteLen;
803                            continue;
804                        }
805                    }
806                    
807                    // check for ignored (outside quotes), and ignore
808                    int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
809                    if (ignoredLen > 0) {
810                        pos += ignoredLen;
811                        continue;
812                    }
813                    
814                    // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
816                    // use trimStart to keep track of trim at the end
817                    int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
818                    if (trimmedLen > 0) {
819                        workArea.append(chars, pos, trimmedLen);
820                        pos += trimmedLen;
821                        continue;
822                    }
823                    
824                    // copy regular character from outside quotes
825                    workArea.append(chars[pos++]);
826                    trimStart = workArea.size();
827                }
828            }
829            
830            // return condition when end of string found
831            addToken(tokens, workArea.substring(0, trimStart));
832            return -1;
833        }
834    
835        /**
836         * Checks if the characters at the index specified match the quote
837         * already matched in readNextToken().
838         *
839         * @param chars  the character array being tokenized
840         * @param pos  the position to check for a quote
841         * @param len  the length of the character array being tokenized
842         * @param quoteStart  the start position of the matched quote, 0 if no quoting
843         * @param quoteLen  the length of the matched quote, 0 if no quoting
844         * @return true if a quote is matched
845         */
846        private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
847            for (int i = 0; i < quoteLen; i++) {
848                if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
849                    return false;
850                }
851            }
852            return true;
853        }
854    
855        // Delimiter
856        //-----------------------------------------------------------------------
857        /**
858         * Gets the field delimiter matcher.
859         *
860         * @return the delimiter matcher in use
861         */
862        public StrMatcher getDelimiterMatcher() {
863            return this.delimMatcher;
864        }
865    
866        /**
867         * Sets the field delimiter matcher.
868         * <p>
     * The delimiter is used to separate one token from another.
870         *
871         * @param delim  the delimiter matcher to use
872         * @return this, to enable chaining
873         */
874        public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
875            if (delim == null) {
876                this.delimMatcher = StrMatcher.noneMatcher();
877            } else {
878                this.delimMatcher = delim;
879            }
880            return this;
881        }
882    
883        /**
884         * Sets the field delimiter character.
885         *
886         * @param delim  the delimiter character to use
887         * @return this, to enable chaining
888         */
889        public StrTokenizer setDelimiterChar(char delim) {
890            return setDelimiterMatcher(StrMatcher.charMatcher(delim));
891        }
892    
893        /**
894         * Sets the field delimiter string.
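     * <p>
     * An illustrative sketch using a multi-character delimiter:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a::b::c");
     * tok.setDelimiterString("::");
     * String[] fields = tok.getTokenArray();   // "a", "b", "c"
     * </pre>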
895         *
896         * @param delim  the delimiter string to use
897         * @return this, to enable chaining
898         */
899        public StrTokenizer setDelimiterString(String delim) {
900            return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
901        }
902    
903        // Quote
904        //-----------------------------------------------------------------------
905        /**
906         * Gets the quote matcher currently in use.
907         * <p>
908         * The quote character is used to wrap data between the tokens.
909         * This enables delimiters to be entered as data.
     * The default is not to use quoting.
911         *
912         * @return the quote matcher in use
913         */
914        public StrMatcher getQuoteMatcher() {
915            return quoteMatcher;
916        }
917    
918        /**
919         * Set the quote matcher to use.
920         * <p>
921         * The quote character is used to wrap data between the tokens.
922         * This enables delimiters to be entered as data.
923         *
924         * @param quote  the quote matcher to use, null ignored
925         * @return this, to enable chaining
926         */
927        public StrTokenizer setQuoteMatcher(StrMatcher quote) {
928            if (quote != null) {
929                this.quoteMatcher = quote;
930            }
931            return this;
932        }
933    
934        /**
935         * Sets the quote character to use.
936         * <p>
937         * The quote character is used to wrap data between the tokens.
938         * This enables delimiters to be entered as data.
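     * <p>
     * For example (an illustrative sketch), using a single quote as the quote character:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,'b,c',d", ',');
     * tok.setQuoteChar('\'');
     * String[] fields = tok.getTokenArray();   // "a", "b,c", "d"
     * </pre>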
939         *
940         * @param quote  the quote character to use
941         * @return this, to enable chaining
942         */
943        public StrTokenizer setQuoteChar(char quote) {
944            return setQuoteMatcher(StrMatcher.charMatcher(quote));
945        }
946    
947        // Ignored
948        //-----------------------------------------------------------------------
949        /**
950         * Gets the ignored character matcher.
951         * <p>
952         * These characters are ignored when parsing the String, unless they are
953         * within a quoted region.
954         * The default value is not to ignore anything.
955         *
956         * @return the ignored matcher in use
957         */
958        public StrMatcher getIgnoredMatcher() {
959            return ignoredMatcher;
960        }
961    
962        /**
963         * Set the matcher for characters to ignore.
964         * <p>
965         * These characters are ignored when parsing the String, unless they are
966         * within a quoted region.
967         *
968         * @param ignored  the ignored matcher to use, null ignored
969         * @return this, to enable chaining
970         */
971        public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
972            if (ignored != null) {
973                this.ignoredMatcher = ignored;
974            }
975            return this;
976        }
977    
978        /**
979         * Set the character to ignore.
980         * <p>
981         * This character is ignored when parsing the String, unless it is
982         * within a quoted region.
983         *
984         * @param ignored  the ignored character to use
985         * @return this, to enable chaining
986         */
987        public StrTokenizer setIgnoredChar(char ignored) {
988            return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
989        }
990    
991        // Trimmer
992        //-----------------------------------------------------------------------
993        /**
994         * Gets the trimmer character matcher.
995         * <p>
996         * These characters are trimmed off on each side of the delimiter
997         * until the token or quote is found.
998         * The default value is not to trim anything.
999         *
1000         * @return the trimmer matcher in use
1001         */
1002        public StrMatcher getTrimmerMatcher() {
1003            return trimmerMatcher;
1004        }
1005    
1006        /**
1007         * Sets the matcher for characters to trim.
1008         * <p>
1009         * These characters are trimmed off on each side of the delimiter
1010         * until the token or quote is found.
1011         *
1012         * @param trimmer  the trimmer matcher to use, null ignored
1013         * @return this, to enable chaining
1014         */
1015        public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1016            if (trimmer != null) {
1017                this.trimmerMatcher = trimmer;
1018            }
1019            return this;
1020        }
1021    
1022        //-----------------------------------------------------------------------
1023        /**
1024         * Gets whether the tokenizer currently returns empty tokens as null.
1025         * The default for this property is false.
1026         *
1027         * @return true if empty tokens are returned as null
1028         */
1029        public boolean isEmptyTokenAsNull() {
1030            return this.emptyAsNull;
1031        }
1032    
1033        /**
1034         * Sets whether the tokenizer should return empty tokens as null.
1035         * The default for this property is false.
1036         *
1037         * @param emptyAsNull  whether empty tokens are returned as null
1038         * @return this, to enable chaining
1039         */
1040        public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1041            this.emptyAsNull = emptyAsNull;
1042            return this;
1043        }
1044    
1045        //-----------------------------------------------------------------------
1046        /**
1047         * Gets whether the tokenizer currently ignores empty tokens.
1048         * The default for this property is true.
1049         *
1050         * @return true if empty tokens are not returned
1051         */
1052        public boolean isIgnoreEmptyTokens() {
1053            return ignoreEmptyTokens;
1054        }
1055    
1056        /**
1057         * Sets whether the tokenizer should ignore and not return empty tokens.
1058         * The default for this property is true.
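     * <p>
     * An illustrative sketch with a comma delimiter:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,,b", ',');
     * tok.getTokenArray();                             // "a", "b"      (default: true)
     * tok.reset("a,,b").setIgnoreEmptyTokens(false);
     * tok.getTokenArray();                             // "a", "", "b"
     * </pre>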
1059         *
1060         * @param ignoreEmptyTokens  whether empty tokens are not returned
1061         * @return this, to enable chaining
1062         */
1063        public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1064            this.ignoreEmptyTokens = ignoreEmptyTokens;
1065            return this;
1066        }
1067    
1068        //-----------------------------------------------------------------------
1069        /**
1070         * Gets the String content that the tokenizer is parsing.
1071         *
1072         * @return the string content being parsed
1073         */
1074        public String getContent() {
1075            if (chars == null) {
1076                return null;
1077            }
1078            return new String(chars);
1079        }
1080    
1081        //-----------------------------------------------------------------------
1082        /**
1083         * Creates a new instance of this Tokenizer. The new instance is reset so
1084         * that it will be at the start of the token list.
1085         * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1086         * 
1087         * @return a new instance of this Tokenizer which has been reset.
1088         */
1089        public Object clone() {
1090            try {
1091                return cloneReset();
1092            } catch (CloneNotSupportedException ex) {
1093                return null;
1094            }
1095        }
1096    
1097        /**
1098         * Creates a new instance of this Tokenizer. The new instance is reset so that
1099         * it will be at the start of the token list.
1100         * 
1101         * @return a new instance of this Tokenizer which has been reset.
1102         * @throws CloneNotSupportedException if there is a problem cloning
1103         */
1104        Object cloneReset() throws CloneNotSupportedException {
1105            // this method exists to enable 100% test coverage
1106            StrTokenizer cloned = (StrTokenizer) super.clone();
1107            if (cloned.chars != null) {
1108                cloned.chars = (char[]) cloned.chars.clone();
1109            }
1110            cloned.reset();
1111            return cloned;
1112        }
1113    
1114        //-----------------------------------------------------------------------
1115        /**
     * Gets a String representation of this tokenizer, including the token list
     * once the input has been tokenized.
     *
     * @return a string representation of the tokenizer
1119         */
1120        public String toString() {
1121            if (tokens == null) {
1122                return "StrTokenizer[not tokenized yet]";
1123            }
1124            return "StrTokenizer" + getTokenList();
1125        }
1126    
1127    }