View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  /**
26   * Tokenizes a string based on delimiters (separators)
27   * and supporting quoting and ignored character concepts.
28   * <p>
29   * This class can split a String into many smaller strings. It aims
30   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
31   * however it offers much more control and flexibility including implementing
32   * the <code>ListIterator</code> interface. By default, it is set up
33   * like <code>StringTokenizer</code>.
34   * <p>
35   * The input String is split into a number of <i>tokens</i>.
36   * Each token is separated from the next String by a <i>delimiter</i>.
37   * One or more delimiter characters must be specified.
38   * <p>
39   * Each token may be surrounded by quotes.
40   * The <i>quote</i> matcher specifies the quote character(s).
41   * A quote may be escaped within a quoted section by duplicating itself.
42   * <p>
43   * Between each token and the delimiter are potentially characters that need trimming.
44   * The <i>trimmer</i> matcher specifies these characters.
45   * One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there might potentially be invalid characters.
48   * The <i>ignored</i> matcher specifies these characters to be removed.
49   * One usage might be to remove new line characters.
50   * <p>
51   * Empty tokens may be removed or returned as null.
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   * <p>
58   *
59   * This tokenizer has the following properties and options:
60   *
61   * <table>
62   *  <tr>
63   *   <th>Property</th><th>Type</th><th>Default</th>
64   *  </tr>
65   *  <tr>
66   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
67   *  </tr>
68   *  <tr>
69   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
70   *  </tr>
71   *  <tr>
72   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
73   *  </tr>
74   *  <tr>
75   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
76   *  </tr>
77   *  <tr>
78   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
79   *  </tr>
80   * </table>
81   *
82   * @author Matthew Inger
83   * @author Stephen Colebourne
84   * @author Gary D. Gregory
85   * @since 2.2
86   * @version $Id: StrTokenizer.java 592077 2007-11-05 16:47:10Z mbenson $
87   */
88  public class StrTokenizer implements ListIterator, Cloneable {
89  
90      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
91      private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
92      static {
93          CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
94          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
95          CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
96          CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
97          CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
98          CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
99          CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100 
101         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108     }
109 
110     /** The text to work on. */
111     private char chars[];
112     /** The parsed tokens */
113     private String tokens[];
114     /** The current iteration position */
115     private int tokenPos;
116 
117     /** The delimiter matcher */
118     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119     /** The quote matcher */
120     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121     /** The ignored matcher */
122     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123     /** The trimmer matcher */
124     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125 
126     /** Whether to return empty tokens as null */
127     private boolean emptyAsNull = false;
128     /** Whether to ignore empty tokens */
129     private boolean ignoreEmptyTokens = true;
130 
131     //-----------------------------------------------------------------------
132 
133     /**
134      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135      * 
136      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137      */
138     private static StrTokenizer getCSVClone() {
139         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140     }
141 
142     /**
143      * Gets a new tokenizer instance which parses Comma Separated Value strings
144      * initializing it with the given input.  The default for CSV processing
145      * will be trim whitespace from both ends (which can be overridden with
146      * the setTrimmer method).
147      * <p>
148      * You must call a "reset" method to set the string which you want to parse.
149      * @return a new tokenizer instance which parses Comma Separated Value strings
150      */
151     public static StrTokenizer getCSVInstance() {
152         return getCSVClone();
153     }
154 
155     /**
156      * Gets a new tokenizer instance which parses Comma Separated Value strings
157      * initializing it with the given input.  The default for CSV processing
158      * will be trim whitespace from both ends (which can be overridden with
159      * the setTrimmer method).
160      *
161      * @param input  the text to parse
162      * @return a new tokenizer instance which parses Comma Separated Value strings
163      */
164     public static StrTokenizer getCSVInstance(String input) {
165         StrTokenizer tok = getCSVClone();
166         tok.reset(input);
167         return tok;
168     }
169 
170     /**
171      * Gets a new tokenizer instance which parses Comma Separated Value strings
172      * initializing it with the given input.  The default for CSV processing
173      * will be trim whitespace from both ends (which can be overridden with
174      * the setTrimmer method).
175      *
176      * @param input  the text to parse
177      * @return a new tokenizer instance which parses Comma Separated Value strings
178      */
179     public static StrTokenizer getCSVInstance(char[] input) {
180         StrTokenizer tok = getCSVClone();
181         tok.reset(input);
182         return tok;
183     }
184 
185     /**
186      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187      * 
188      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189      */
190     private static StrTokenizer getTSVClone() {
191         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192     }
193 
194 
195     /**
196      * Gets a new tokenizer instance which parses Tab Separated Value strings.
197      * The default for CSV processing will be trim whitespace from both ends
198      * (which can be overridden with the setTrimmer method).
199      * <p>
200      * You must call a "reset" method to set the string which you want to parse.
201      * @return a new tokenizer instance which parses Tab Separated Value strings.
202      */
203     public static StrTokenizer getTSVInstance() {
204         return getTSVClone();
205     }
206 
207     /**
208      * Gets a new tokenizer instance which parses Tab Separated Value strings.
209      * The default for CSV processing will be trim whitespace from both ends
210      * (which can be overridden with the setTrimmer method).
211      * @param input  the string to parse
212      * @return a new tokenizer instance which parses Tab Separated Value strings.
213      */
214     public static StrTokenizer getTSVInstance(String input) {
215         StrTokenizer tok = getTSVClone();
216         tok.reset(input);
217         return tok;
218     }
219 
220     /**
221      * Gets a new tokenizer instance which parses Tab Separated Value strings.
222      * The default for CSV processing will be trim whitespace from both ends
223      * (which can be overridden with the setTrimmer method).
224      * @param input  the string to parse
225      * @return a new tokenizer instance which parses Tab Separated Value strings.
226      */
227     public static StrTokenizer getTSVInstance(char[] input) {
228         StrTokenizer tok = getTSVClone();
229         tok.reset(input);
230         return tok;
231     }
232 
233     //-----------------------------------------------------------------------
234     /**
235      * Constructs a tokenizer splitting on space, tab, newline and formfeed
236      * as per StringTokenizer, but with no text to tokenize.
237      * <p>
238      * This constructor is normally used with {@link #reset(String)}.
239      */
240     public StrTokenizer() {
241         super();
242         this.chars = null;
243     }
244 
245     /**
246      * Constructs a tokenizer splitting on space, tab, newline and formfeed
247      * as per StringTokenizer.
248      *
249      * @param input  the string which is to be parsed
250      */
251     public StrTokenizer(String input) {
252         super();
253         if (input != null) {
254             chars = input.toCharArray();
255         } else {
256             chars = null;
257         }
258     }
259 
260     /**
261      * Constructs a tokenizer splitting on the specified delimiter character.
262      *
263      * @param input  the string which is to be parsed
264      * @param delim  the field delimiter character
265      */
266     public StrTokenizer(String input, char delim) {
267         this(input);
268         setDelimiterChar(delim);
269     }
270 
271     /**
272      * Constructs a tokenizer splitting on the specified delimiter string.
273      *
274      * @param input  the string which is to be parsed
275      * @param delim  the field delimiter string
276      */
277     public StrTokenizer(String input, String delim) {
278         this(input);
279         setDelimiterString(delim);
280     }
281 
282     /**
283      * Constructs a tokenizer splitting using the specified delimiter matcher.
284      *
285      * @param input  the string which is to be parsed
286      * @param delim  the field delimiter matcher
287      */
288     public StrTokenizer(String input, StrMatcher delim) {
289         this(input);
290         setDelimiterMatcher(delim);
291     }
292 
293     /**
294      * Constructs a tokenizer splitting on the specified delimiter character
295      * and handling quotes using the specified quote character.
296      *
297      * @param input  the string which is to be parsed
298      * @param delim  the field delimiter character
299      * @param quote  the field quoted string character
300      */
301     public StrTokenizer(String input, char delim, char quote) {
302         this(input, delim);
303         setQuoteChar(quote);
304     }
305 
306     /**
307      * Constructs a tokenizer splitting using the specified delimiter matcher
308      * and handling quotes using the specified quote matcher.
309      *
310      * @param input  the string which is to be parsed
311      * @param delim  the field delimiter matcher
312      * @param quote  the field quoted string matcher
313      */
314     public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
315         this(input, delim);
316         setQuoteMatcher(quote);
317     }
318 
319     /**
320      * Constructs a tokenizer splitting on space, tab, newline and formfeed
321      * as per StringTokenizer.
322      * <p>
323      * The input character array is not cloned, and must not be altered after
324      * passing in to this method.
325      *
326      * @param input  the string which is to be parsed, not cloned
327      */
328     public StrTokenizer(char[] input) {
329         super();
330         this.chars = input;
331     }
332 
333     /**
334      * Constructs a tokenizer splitting on the specified character.
335      * <p>
336      * The input character array is not cloned, and must not be altered after
337      * passing in to this method.
338      *
339      * @param input  the string which is to be parsed, not cloned
340      * @param delim the field delimiter character
341      */
342     public StrTokenizer(char[] input, char delim) {
343         this(input);
344         setDelimiterChar(delim);
345     }
346 
347     /**
348      * Constructs a tokenizer splitting on the specified string.
349      * <p>
350      * The input character array is not cloned, and must not be altered after
351      * passing in to this method.
352      *
353      * @param input  the string which is to be parsed, not cloned
354      * @param delim the field delimiter string
355      */
356     public StrTokenizer(char[] input, String delim) {
357         this(input);
358         setDelimiterString(delim);
359     }
360 
361     /**
362      * Constructs a tokenizer splitting using the specified delimiter matcher.
363      * <p>
364      * The input character array is not cloned, and must not be altered after
365      * passing in to this method.
366      *
367      * @param input  the string which is to be parsed, not cloned
368      * @param delim  the field delimiter matcher
369      */
370     public StrTokenizer(char[] input, StrMatcher delim) {
371         this(input);
372         setDelimiterMatcher(delim);
373     }
374 
375     /**
376      * Constructs a tokenizer splitting on the specified delimiter character
377      * and handling quotes using the specified quote character.
378      * <p>
379      * The input character array is not cloned, and must not be altered after
380      * passing in to this method.
381      *
382      * @param input  the string which is to be parsed, not cloned
383      * @param delim  the field delimiter character
384      * @param quote  the field quoted string character
385      */
386     public StrTokenizer(char[] input, char delim, char quote) {
387         this(input, delim);
388         setQuoteChar(quote);
389     }
390 
391     /**
392      * Constructs a tokenizer splitting using the specified delimiter matcher
393      * and handling quotes using the specified quote matcher.
394      * <p>
395      * The input character array is not cloned, and must not be altered after
396      * passing in to this method.
397      *
398      * @param input  the string which is to be parsed, not cloned
399      * @param delim  the field delimiter character
400      * @param quote  the field quoted string character
401      */
402     public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
403         this(input, delim);
404         setQuoteMatcher(quote);
405     }
406 
407     // API
408     //-----------------------------------------------------------------------
409     /**
410      * Gets the number of tokens found in the String.
411      *
412      * @return the number of matched tokens
413      */
414     public int size() {
415         checkTokenized();
416         return tokens.length;
417     }
418 
419     /**
420      * Gets the next token from the String.
421      *
422      * @return the next sequential token, or null when no more tokens are found
423      */
424     public String nextToken() {
425         if (hasNext()) {
426             return tokens[tokenPos++];
427         }
428         return null;
429     }
430 
431     /**
432      * Gets the previous token from the String.
433      *
434      * @return the previous sequential token, or null when no more tokens are found
435      */
436     public String previousToken() {
437         if (hasPrevious()) {
438             return tokens[--tokenPos];
439         }
440         return null;
441     }
442 
443     /**
444      * Gets a copy of the full token list as an independent modifiable array.
445      *
446      * @return the tokens as a String array
447      */
448     public String[] getTokenArray() {
449         checkTokenized();
450         return (String[]) tokens.clone();
451     }
452 
453     /**
454      * Gets a copy of the full token list as an independent modifiable list.
455      *
456      * @return the tokens as a String array
457      */
458     public List getTokenList() {
459         checkTokenized();
460         List list = new ArrayList(tokens.length);
461         for (int i = 0; i < tokens.length; i++) {
462             list.add(tokens[i]);
463         }
464         return list;
465     }
466 
467     /**
468      * Resets this tokenizer, forgetting all parsing and iteration already completed.
469      * <p>
470      * This method allows the same tokenizer to be reused for the same String.
471      *
472      * @return this, to enable chaining
473      */
474     public StrTokenizer reset() {
475         tokenPos = 0;
476         tokens = null;
477         return this;
478     }
479 
480     /**
481      * Reset this tokenizer, giving it a new input string to parse.
482      * In this manner you can re-use a tokenizer with the same settings
483      * on multiple input lines.
484      *
485      * @param input  the new string to tokenize, null sets no text to parse
486      * @return this, to enable chaining
487      */
488     public StrTokenizer reset(String input) {
489         reset();
490         if (input != null) {
491             this.chars = input.toCharArray();
492         } else {
493             this.chars = null;
494         }
495         return this;
496     }
497 
498     /**
499      * Reset this tokenizer, giving it a new input string to parse.
500      * In this manner you can re-use a tokenizer with the same settings
501      * on multiple input lines.
502      * <p>
503      * The input character array is not cloned, and must not be altered after
504      * passing in to this method.
505      *
506      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
507      * @return this, to enable chaining
508      */
509     public StrTokenizer reset(char[] input) {
510         reset();
511         this.chars = input;
512         return this;
513     }
514 
515     // ListIterator
516     //-----------------------------------------------------------------------
517     /**
518      * Checks whether there are any more tokens.
519      *
520      * @return true if there are more tokens
521      */
522     public boolean hasNext() {
523         checkTokenized();
524         return tokenPos < tokens.length;
525     }
526 
527     /**
528      * Gets the next token. This method is equivalent to {@link #nextToken()}.
529      *
530      * @return the next String token
531      */
532     public Object next() {
533         if (hasNext()) {
534             return tokens[tokenPos++];
535         }
536         throw new NoSuchElementException();
537     }
538 
539     /**
540      * Gets the index of the next token to return.
541      *
542      * @return the next token index
543      */
544     public int nextIndex() {
545         return tokenPos;
546     }
547 
548     /**
549      * Checks whether there are any previous tokens that can be iterated to.
550      *
551      * @return true if there are previous tokens
552      */
553     public boolean hasPrevious() {
554         checkTokenized();
555         return tokenPos > 0;
556     }
557 
558     /**
559      * Gets the token previous to the last returned token.
560      *
561      * @return the previous token
562      */
563     public Object previous() {
564         if (hasPrevious()) {
565             return tokens[--tokenPos];
566         }
567         throw new NoSuchElementException();
568     }
569 
570     /**
571      * Gets the index of the previous token.
572      *
573      * @return the previous token index
574      */
575     public int previousIndex() {
576         return tokenPos - 1;
577     }
578 
579     /**
580      * Unsupported ListIterator operation.
581      *
582      * @throws UnsupportedOperationException always
583      */
584     public void remove() {
585         throw new UnsupportedOperationException("remove() is unsupported");
586     }
587 
588     /**
589      * Unsupported ListIterator operation.
590      * @param obj this parameter ignored.
591      * @throws UnsupportedOperationException always
592      */
593     public void set(Object obj) {
594         throw new UnsupportedOperationException("set() is unsupported");
595     }
596 
597     /**
598      * Unsupported ListIterator operation.
599      * @param obj this parameter ignored.
600      * @throws UnsupportedOperationException always
601      */
602     public void add(Object obj) {
603         throw new UnsupportedOperationException("add() is unsupported");
604     }
605 
606     // Implementation
607     //-----------------------------------------------------------------------
608     /**
609      * Checks if tokenization has been done, and if not then do it.
610      */
611     private void checkTokenized() {
612         if (tokens == null) {
613             if (chars == null) {
614                 // still call tokenize as subclass may do some work
615                 List split = tokenize(null, 0, 0);
616                 tokens = (String[]) split.toArray(new String[split.size()]);
617             } else {
618                 List split = tokenize(chars, 0, chars.length);
619                 tokens = (String[]) split.toArray(new String[split.size()]);
620             }
621         }
622     }
623 
624     /**
625      * Internal method to performs the tokenization.
626      * <p>
627      * Most users of this class do not need to call this method. This method
628      * will be called automatically by other (public) methods when required.
629      * <p>
630      * This method exists to allow subclasses to add code before or after the
631      * tokenization. For example, a subclass could alter the character array,
632      * offset or count to be parsed, or call the tokenizer multiple times on
633      * multiple strings. It is also be possible to filter the results.
634      * <p>
635      * <code>StrTokenizer</code> will always pass a zero offset and a count
636      * equal to the length of the array to this method, however a subclass
637      * may pass other values, or even an entirely different array.
638      * 
639      * @param chars  the character array being tokenized, may be null
640      * @param offset  the start position within the character array, must be valid
641      * @param count  the number of characters to tokenize, must be valid
642      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
643      */
644     protected List tokenize(char[] chars, int offset, int count) {
645         if (chars == null || count == 0) {
646             return Collections.EMPTY_LIST;
647         }
648         StrBuilder buf = new StrBuilder();
649         List tokens = new ArrayList();
650         int pos = offset;
651         
652         // loop around the entire buffer
653         while (pos >= 0 && pos < count) {
654             // find next token
655             pos = readNextToken(chars, pos, count, buf, tokens);
656             
657             // handle case where end of string is a delimiter
658             if (pos >= count) {
659                 addToken(tokens, "");
660             }
661         }
662         return tokens;
663     }
664 
665     /**
666      * Adds a token to a list, paying attention to the parameters we've set.
667      *
668      * @param list  the list to add to
669      * @param tok  the token to add
670      */
671     private void addToken(List list, String tok) {
672         if (tok == null || tok.length() == 0) {
673             if (isIgnoreEmptyTokens()) {
674                 return;
675             }
676             if (isEmptyTokenAsNull()) {
677                 tok = null;
678             }
679         }
680         list.add(tok);
681     }
682 
683     /**
684      * Reads character by character through the String to get the next token.
685      *
686      * @param chars  the character array being tokenized
687      * @param start  the first character of field
688      * @param len  the length of the character array being tokenized
689      * @param workArea  a temporary work area
690      * @param tokens  the list of parsed tokens
691      * @return the starting position of the next field (the character
692      *  immediately after the delimiter), or -1 if end of string found
693      */
694     private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
695         // skip all leading whitespace, unless it is the
696         // field delimiter or the quote character
697         while (start < len) {
698             int removeLen = Math.max(
699                     getIgnoredMatcher().isMatch(chars, start, start, len),
700                     getTrimmerMatcher().isMatch(chars, start, start, len));
701             if (removeLen == 0 ||
702                 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
703                 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
704                 break;
705             }
706             start += removeLen;
707         }
708         
709         // handle reaching end
710         if (start >= len) {
711             addToken(tokens, "");
712             return -1;
713         }
714         
715         // handle empty token
716         int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
717         if (delimLen > 0) {
718             addToken(tokens, "");
719             return start + delimLen;
720         }
721         
722         // handle found token
723         int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
724         if (quoteLen > 0) {
725             return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
726         }
727         return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
728     }
729 
730     /**
731      * Reads a possibly quoted string token.
732      *
733      * @param chars  the character array being tokenized
734      * @param start  the first character of field
735      * @param len  the length of the character array being tokenized
736      * @param workArea  a temporary work area
737      * @param tokens  the list of parsed tokens
738      * @param quoteStart  the start position of the matched quote, 0 if no quoting
739      * @param quoteLen  the length of the matched quote, 0 if no quoting
740      * @return the starting position of the next field (the character
741      *  immediately after the delimiter, or if end of string found,
742      *  then the length of string
743      */
744     private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea, 
745                                List tokens, int quoteStart, int quoteLen) 
746     {
747         // Loop until we've found the end of the quoted
748         // string or the end of the input
749         workArea.clear();
750         int pos = start;
751         boolean quoting = (quoteLen > 0);
752         int trimStart = 0;
753         
754         while (pos < len) {
755             // quoting mode can occur several times throughout a string
756             // we must switch between quoting and non-quoting until we
757             // encounter a non-quoted delimiter, or end of string
758             if (quoting) {
759                 // In quoting mode
760                 
761                 // If we've found a quote character, see if it's
762                 // followed by a second quote.  If so, then we need
763                 // to actually put the quote character into the token
764                 // rather than end the token.
765                 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
766                     if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
767                         // matched pair of quotes, thus an escaped quote
768                         workArea.append(chars, pos, quoteLen);
769                         pos += (quoteLen * 2);
770                         trimStart = workArea.size();
771                         continue;
772                     }
773                     
774                     // end of quoting
775                     quoting = false;
776                     pos += quoteLen;
777                     continue;
778                 }
779                 
780                 // copy regular character from inside quotes
781                 workArea.append(chars[pos++]);
782                 trimStart = workArea.size();
783                 
784             } else {
785                 // Not in quoting mode
786                 
787                 // check for delimiter, and thus end of token
788                 int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
789                 if (delimLen > 0) {
790                     // return condition when end of token found
791                     addToken(tokens, workArea.substring(0, trimStart));
792                     return pos + delimLen;
793                 }
794                 
795                 // check for quote, and thus back into quoting mode
796                 if (quoteLen > 0) {
797                     if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
798                         quoting = true;
799                         pos += quoteLen;
800                         continue;
801                     }
802                 }
803                 
804                 // check for ignored (outside quotes), and ignore
805                 int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
806                 if (ignoredLen > 0) {
807                     pos += ignoredLen;
808                     continue;
809                 }
810                 
811                 // check for trimmed character
812                 // don't yet know if its at the end, so copy to workArea
813                 // use trimStart to keep track of trim at the end
814                 int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
815                 if (trimmedLen > 0) {
816                     workArea.append(chars, pos, trimmedLen);
817                     pos += trimmedLen;
818                     continue;
819                 }
820                 
821                 // copy regular character from outside quotes
822                 workArea.append(chars[pos++]);
823                 trimStart = workArea.size();
824             }
825         }
826         
827         // return condition when end of string found
828         addToken(tokens, workArea.substring(0, trimStart));
829         return -1;
830     }
831 
832     /**
833      * Checks if the characters at the index specified match the quote
834      * already matched in readNextToken().
835      *
836      * @param chars  the character array being tokenized
837      * @param pos  the position to check for a quote
838      * @param len  the length of the character array being tokenized
839      * @param quoteStart  the start position of the matched quote, 0 if no quoting
840      * @param quoteLen  the length of the matched quote, 0 if no quoting
841      * @return true if a quote is matched
842      */
843     private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
844         for (int i = 0; i < quoteLen; i++) {
845             if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
846                 return false;
847             }
848         }
849         return true;
850     }
851 
852     // Delimiter
853     //-----------------------------------------------------------------------
854     /**
855      * Gets the field delimiter matcher.
856      *
857      * @return the delimiter matcher in use
858      */
859     public StrMatcher getDelimiterMatcher() {
860         return this.delimMatcher;
861     }
862 
863     /**
864      * Sets the field delimiter matcher.
865      * <p>
866      * The delimitier is used to separate one token from another.
867      *
868      * @param delim  the delimiter matcher to use
869      * @return this, to enable chaining
870      */
871     public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
872         if (delim == null) {
873             this.delimMatcher = StrMatcher.noneMatcher();
874         } else {
875             this.delimMatcher = delim;
876         }
877         return this;
878     }
879 
880     /**
881      * Sets the field delimiter character.
882      *
883      * @param delim  the delimiter character to use
884      * @return this, to enable chaining
885      */
886     public StrTokenizer setDelimiterChar(char delim) {
887         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
888     }
889 
890     /**
891      * Sets the field delimiter string.
892      *
893      * @param delim  the delimiter string to use
894      * @return this, to enable chaining
895      */
896     public StrTokenizer setDelimiterString(String delim) {
897         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
898     }
899 
900     // Quote
901     //-----------------------------------------------------------------------
902     /**
903      * Gets the quote matcher currently in use.
904      * <p>
905      * The quote character is used to wrap data between the tokens.
906      * This enables delimiters to be entered as data.
907      * The default value is '"' (double quote).
908      *
909      * @return the quote matcher in use
910      */
911     public StrMatcher getQuoteMatcher() {
912         return quoteMatcher;
913     }
914 
915     /**
916      * Set the quote matcher to use.
917      * <p>
918      * The quote character is used to wrap data between the tokens.
919      * This enables delimiters to be entered as data.
920      *
921      * @param quote  the quote matcher to use, null ignored
922      * @return this, to enable chaining
923      */
924     public StrTokenizer setQuoteMatcher(StrMatcher quote) {
925         if (quote != null) {
926             this.quoteMatcher = quote;
927         }
928         return this;
929     }
930 
931     /**
932      * Sets the quote character to use.
933      * <p>
934      * The quote character is used to wrap data between the tokens.
935      * This enables delimiters to be entered as data.
936      *
937      * @param quote  the quote character to use
938      * @return this, to enable chaining
939      */
940     public StrTokenizer setQuoteChar(char quote) {
941         return setQuoteMatcher(StrMatcher.charMatcher(quote));
942     }
943 
944     // Ignored
945     //-----------------------------------------------------------------------
946     /**
947      * Gets the ignored character matcher.
948      * <p>
949      * These characters are ignored when parsing the String, unless they are
950      * within a quoted region.
951      * The default value is not to ignore anything.
952      *
953      * @return the ignored matcher in use
954      */
955     public StrMatcher getIgnoredMatcher() {
956         return ignoredMatcher;
957     }
958 
959     /**
960      * Set the matcher for characters to ignore.
961      * <p>
962      * These characters are ignored when parsing the String, unless they are
963      * within a quoted region.
964      *
965      * @param ignored  the ignored matcher to use, null ignored
966      * @return this, to enable chaining
967      */
968     public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
969         if (ignored != null) {
970             this.ignoredMatcher = ignored;
971         }
972         return this;
973     }
974 
975     /**
976      * Set the character to ignore.
977      * <p>
978      * This character is ignored when parsing the String, unless it is
979      * within a quoted region.
980      *
981      * @param ignored  the ignored character to use
982      * @return this, to enable chaining
983      */
984     public StrTokenizer setIgnoredChar(char ignored) {
985         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
986     }
987 
988     // Trimmer
989     //-----------------------------------------------------------------------
990     /**
991      * Gets the trimmer character matcher.
992      * <p>
993      * These characters are trimmed off on each side of the delimiter
994      * until the token or quote is found.
995      * The default value is not to trim anything.
996      *
997      * @return the trimmer matcher in use
998      */
999     public StrMatcher getTrimmerMatcher() {
1000         return trimmerMatcher;
1001     }
1002 
1003     /**
1004      * Sets the matcher for characters to trim.
1005      * <p>
1006      * These characters are trimmed off on each side of the delimiter
1007      * until the token or quote is found.
1008      *
1009      * @param trimmer  the trimmer matcher to use, null ignored
1010      * @return this, to enable chaining
1011      */
1012     public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1013         if (trimmer != null) {
1014             this.trimmerMatcher = trimmer;
1015         }
1016         return this;
1017     }
1018 
1019     //-----------------------------------------------------------------------
1020     /**
1021      * Gets whether the tokenizer currently returns empty tokens as null.
1022      * The default for this property is false.
1023      *
1024      * @return true if empty tokens are returned as null
1025      */
1026     public boolean isEmptyTokenAsNull() {
1027         return this.emptyAsNull;
1028     }
1029 
1030     /**
1031      * Sets whether the tokenizer should return empty tokens as null.
1032      * The default for this property is false.
1033      *
1034      * @param emptyAsNull  whether empty tokens are returned as null
1035      * @return this, to enable chaining
1036      */
1037     public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1038         this.emptyAsNull = emptyAsNull;
1039         return this;
1040     }
1041 
1042     //-----------------------------------------------------------------------
1043     /**
1044      * Gets whether the tokenizer currently ignores empty tokens.
1045      * The default for this property is true.
1046      *
1047      * @return true if empty tokens are not returned
1048      */
1049     public boolean isIgnoreEmptyTokens() {
1050         return ignoreEmptyTokens;
1051     }
1052 
1053     /**
1054      * Sets whether the tokenizer should ignore and not return empty tokens.
1055      * The default for this property is true.
1056      *
1057      * @param ignoreEmptyTokens  whether empty tokens are not returned
1058      * @return this, to enable chaining
1059      */
1060     public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1061         this.ignoreEmptyTokens = ignoreEmptyTokens;
1062         return this;
1063     }
1064 
1065     //-----------------------------------------------------------------------
1066     /**
1067      * Gets the String content that the tokenizer is parsing.
1068      *
1069      * @return the string content being parsed
1070      */
1071     public String getContent() {
1072         if (chars == null) {
1073             return null;
1074         }
1075         return new String(chars);
1076     }
1077 
1078     //-----------------------------------------------------------------------
1079     /**
1080      * Creates a new instance of this Tokenizer. The new instance is reset so
1081      * that it will be at the start of the token list.
1082      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1083      * 
1084      * @return a new instance of this Tokenizer which has been reset.
1085      */
1086     public Object clone() {
1087         try {
1088             return cloneReset();
1089         } catch (CloneNotSupportedException ex) {
1090             return null;
1091         }
1092     }
1093 
1094     /**
1095      * Creates a new instance of this Tokenizer. The new instance is reset so that
1096      * it will be at the start of the token list.
1097      * 
1098      * @return a new instance of this Tokenizer which has been reset.
1099      * @throws CloneNotSupportedException if there is a problem cloning
1100      */
1101     Object cloneReset() throws CloneNotSupportedException {
1102         // this method exists to enable 100% test coverage
1103         StrTokenizer cloned = (StrTokenizer) super.clone();
1104         if (cloned.chars != null) {
1105             cloned.chars = (char[]) cloned.chars.clone();
1106         }
1107         cloned.reset();
1108         return cloned;
1109     }
1110 
1111     //-----------------------------------------------------------------------
1112     /**
1113      * Gets the String content that the tokenizer is parsing.
1114      *
1115      * @return the string content being parsed
1116      */
1117     public String toString() {
1118         if (tokens == null) {
1119             return "StrTokenizer[not tokenized yet]";
1120         }
1121         return "StrTokenizer" + getTokenList();
1122     }
1123 
1124 }