View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.lang3.ArrayUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators)
30   * and supporting quoting and ignored character concepts.
31   * <p>
32   * This class can split a String into many smaller strings. It aims
33   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34   * however it offers much more control and flexibility including implementing
35   * the <code>ListIterator</code> interface. By default, it is set up
36   * like <code>StringTokenizer</code>.
37   * <p>
38   * The input String is split into a number of <i>tokens</i>.
39   * Each token is separated from the next String by a <i>delimiter</i>.
40   * One or more delimiter characters must be specified.
41   * <p>
42   * Each token may be surrounded by quotes.
43   * The <i>quote</i> matcher specifies the quote character(s).
44   * A quote may be escaped within a quoted section by duplicating itself.
45   * <p>
46   * Between each token and the delimiter are potentially characters that need trimming.
47   * The <i>trimmer</i> matcher specifies these characters.
48   * One usage might be to trim whitespace characters.
49   * <p>
50   * At any point outside the quotes there might potentially be invalid characters.
51   * The <i>ignored</i> matcher specifies these characters to be removed.
52   * One usage might be to remove new line characters.
53   * <p>
54   * Empty tokens may be removed or returned as null.
55   * <pre>
56   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
57   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
58   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59   * </pre>
60   * <p>
61   *
62   * This tokenizer has the following properties and options:
63   *
64   * <table>
65   *  <tr>
66   *   <th>Property</th><th>Type</th><th>Default</th>
67   *  </tr>
68   *  <tr>
69   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
70   *  </tr>
71   *  <tr>
72   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
73   *  </tr>
74   *  <tr>
75   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
76   *  </tr>
77   *  <tr>
78   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
79   *  </tr>
80   *  <tr>
81   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
82   *  </tr>
83   * </table>
84   *
85   * @since 2.2
86   * @version $Id: StrTokenizer.java 1436770 2013-01-22 07:09:45Z ggregory $
87   */
88  public class StrTokenizer implements ListIterator<String>, Cloneable {
89  
90      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
91      private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
92      static {
93          CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
94          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
95          CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
96          CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
97          CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
98          CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
99          CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100 
101         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108     }
109 
110     /** The text to work on. */
111     private char chars[];
112     /** The parsed tokens */
113     private String tokens[];
114     /** The current iteration position */
115     private int tokenPos;
116 
117     /** The delimiter matcher */
118     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119     /** The quote matcher */
120     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121     /** The ignored matcher */
122     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123     /** The trimmer matcher */
124     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125 
126     /** Whether to return empty tokens as null */
127     private boolean emptyAsNull = false;
128     /** Whether to ignore empty tokens */
129     private boolean ignoreEmptyTokens = true;
130 
131     //-----------------------------------------------------------------------
132 
133     /**
134      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135      * 
136      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137      */
138     private static StrTokenizer getCSVClone() {
139         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140     }
141 
142     /**
143      * Gets a new tokenizer instance which parses Comma Separated Value strings
144      * initializing it with the given input.  The default for CSV processing
145      * will be trim whitespace from both ends (which can be overridden with
146      * the setTrimmer method).
147      * <p>
148      * You must call a "reset" method to set the string which you want to parse.
149      * @return a new tokenizer instance which parses Comma Separated Value strings
150      */
151     public static StrTokenizer getCSVInstance() {
152         return getCSVClone();
153     }
154 
155     /**
156      * Gets a new tokenizer instance which parses Comma Separated Value strings
157      * initializing it with the given input.  The default for CSV processing
158      * will be trim whitespace from both ends (which can be overridden with
159      * the setTrimmer method).
160      *
161      * @param input  the text to parse
162      * @return a new tokenizer instance which parses Comma Separated Value strings
163      */
164     public static StrTokenizer getCSVInstance(final String input) {
165         final StrTokenizer tok = getCSVClone();
166         tok.reset(input);
167         return tok;
168     }
169 
170     /**
171      * Gets a new tokenizer instance which parses Comma Separated Value strings
172      * initializing it with the given input.  The default for CSV processing
173      * will be trim whitespace from both ends (which can be overridden with
174      * the setTrimmer method).
175      *
176      * @param input  the text to parse
177      * @return a new tokenizer instance which parses Comma Separated Value strings
178      */
179     public static StrTokenizer getCSVInstance(final char[] input) {
180         final StrTokenizer tok = getCSVClone();
181         tok.reset(input);
182         return tok;
183     }
184 
185     /**
186      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187      * 
188      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189      */
190     private static StrTokenizer getTSVClone() {
191         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192     }
193 
194 
195     /**
196      * Gets a new tokenizer instance which parses Tab Separated Value strings.
197      * The default for CSV processing will be trim whitespace from both ends
198      * (which can be overridden with the setTrimmer method).
199      * <p>
200      * You must call a "reset" method to set the string which you want to parse.
201      * @return a new tokenizer instance which parses Tab Separated Value strings.
202      */
203     public static StrTokenizer getTSVInstance() {
204         return getTSVClone();
205     }
206 
207     /**
208      * Gets a new tokenizer instance which parses Tab Separated Value strings.
209      * The default for CSV processing will be trim whitespace from both ends
210      * (which can be overridden with the setTrimmer method).
211      * @param input  the string to parse
212      * @return a new tokenizer instance which parses Tab Separated Value strings.
213      */
214     public static StrTokenizer getTSVInstance(final String input) {
215         final StrTokenizer tok = getTSVClone();
216         tok.reset(input);
217         return tok;
218     }
219 
220     /**
221      * Gets a new tokenizer instance which parses Tab Separated Value strings.
222      * The default for CSV processing will be trim whitespace from both ends
223      * (which can be overridden with the setTrimmer method).
224      * @param input  the string to parse
225      * @return a new tokenizer instance which parses Tab Separated Value strings.
226      */
227     public static StrTokenizer getTSVInstance(final char[] input) {
228         final StrTokenizer tok = getTSVClone();
229         tok.reset(input);
230         return tok;
231     }
232 
233     //-----------------------------------------------------------------------
234     /**
235      * Constructs a tokenizer splitting on space, tab, newline and formfeed
236      * as per StringTokenizer, but with no text to tokenize.
237      * <p>
238      * This constructor is normally used with {@link #reset(String)}.
239      */
240     public StrTokenizer() {
241         super();
242         this.chars = null;
243     }
244 
245     /**
246      * Constructs a tokenizer splitting on space, tab, newline and formfeed
247      * as per StringTokenizer.
248      *
249      * @param input  the string which is to be parsed
250      */
251     public StrTokenizer(final String input) {
252         super();
253         if (input != null) {
254             chars = input.toCharArray();
255         } else {
256             chars = null;
257         }
258     }
259 
260     /**
261      * Constructs a tokenizer splitting on the specified delimiter character.
262      *
263      * @param input  the string which is to be parsed
264      * @param delim  the field delimiter character
265      */
266     public StrTokenizer(final String input, final char delim) {
267         this(input);
268         setDelimiterChar(delim);
269     }
270 
271     /**
272      * Constructs a tokenizer splitting on the specified delimiter string.
273      *
274      * @param input  the string which is to be parsed
275      * @param delim  the field delimiter string
276      */
277     public StrTokenizer(final String input, final String delim) {
278         this(input);
279         setDelimiterString(delim);
280     }
281 
282     /**
283      * Constructs a tokenizer splitting using the specified delimiter matcher.
284      *
285      * @param input  the string which is to be parsed
286      * @param delim  the field delimiter matcher
287      */
288     public StrTokenizer(final String input, final StrMatcher delim) {
289         this(input);
290         setDelimiterMatcher(delim);
291     }
292 
293     /**
294      * Constructs a tokenizer splitting on the specified delimiter character
295      * and handling quotes using the specified quote character.
296      *
297      * @param input  the string which is to be parsed
298      * @param delim  the field delimiter character
299      * @param quote  the field quoted string character
300      */
301     public StrTokenizer(final String input, final char delim, final char quote) {
302         this(input, delim);
303         setQuoteChar(quote);
304     }
305 
306     /**
307      * Constructs a tokenizer splitting using the specified delimiter matcher
308      * and handling quotes using the specified quote matcher.
309      *
310      * @param input  the string which is to be parsed
311      * @param delim  the field delimiter matcher
312      * @param quote  the field quoted string matcher
313      */
314     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
315         this(input, delim);
316         setQuoteMatcher(quote);
317     }
318 
319     /**
320      * Constructs a tokenizer splitting on space, tab, newline and formfeed
321      * as per StringTokenizer.
322      *
323      * @param input  the string which is to be parsed, not cloned
324      */
325     public StrTokenizer(final char[] input) {
326         super();
327         this.chars = ArrayUtils.clone(input);
328     }
329 
330     /**
331      * Constructs a tokenizer splitting on the specified character.
332      *
333      * @param input  the string which is to be parsed, not cloned
334      * @param delim the field delimiter character
335      */
336     public StrTokenizer(final char[] input, final char delim) {
337         this(input);
338         setDelimiterChar(delim);
339     }
340 
341     /**
342      * Constructs a tokenizer splitting on the specified string.
343      *
344      * @param input  the string which is to be parsed, not cloned
345      * @param delim the field delimiter string
346      */
347     public StrTokenizer(final char[] input, final String delim) {
348         this(input);
349         setDelimiterString(delim);
350     }
351 
352     /**
353      * Constructs a tokenizer splitting using the specified delimiter matcher.
354      *
355      * @param input  the string which is to be parsed, not cloned
356      * @param delim  the field delimiter matcher
357      */
358     public StrTokenizer(final char[] input, final StrMatcher delim) {
359         this(input);
360         setDelimiterMatcher(delim);
361     }
362 
363     /**
364      * Constructs a tokenizer splitting on the specified delimiter character
365      * and handling quotes using the specified quote character.
366      *
367      * @param input  the string which is to be parsed, not cloned
368      * @param delim  the field delimiter character
369      * @param quote  the field quoted string character
370      */
371     public StrTokenizer(final char[] input, final char delim, final char quote) {
372         this(input, delim);
373         setQuoteChar(quote);
374     }
375 
376     /**
377      * Constructs a tokenizer splitting using the specified delimiter matcher
378      * and handling quotes using the specified quote matcher.
379      *
380      * @param input  the string which is to be parsed, not cloned
381      * @param delim  the field delimiter character
382      * @param quote  the field quoted string character
383      */
384     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
385         this(input, delim);
386         setQuoteMatcher(quote);
387     }
388 
389     // API
390     //-----------------------------------------------------------------------
391     /**
392      * Gets the number of tokens found in the String.
393      *
394      * @return the number of matched tokens
395      */
396     public int size() {
397         checkTokenized();
398         return tokens.length;
399     }
400 
401     /**
402      * Gets the next token from the String.
403      * Equivalent to {@link #next()} except it returns null rather than
404      * throwing {@link NoSuchElementException} when no tokens remain.
405      *
406      * @return the next sequential token, or null when no more tokens are found
407      */
408     public String nextToken() {
409         if (hasNext()) {
410             return tokens[tokenPos++];
411         }
412         return null;
413     }
414 
415     /**
416      * Gets the previous token from the String.
417      *
418      * @return the previous sequential token, or null when no more tokens are found
419      */
420     public String previousToken() {
421         if (hasPrevious()) {
422             return tokens[--tokenPos];
423         }
424         return null;
425     }
426 
427     /**
428      * Gets a copy of the full token list as an independent modifiable array.
429      *
430      * @return the tokens as a String array
431      */
432     public String[] getTokenArray() {
433         checkTokenized();
434         return tokens.clone();
435     }
436 
437     /**
438      * Gets a copy of the full token list as an independent modifiable list.
439      *
440      * @return the tokens as a String array
441      */
442     public List<String> getTokenList() {
443         checkTokenized();
444         final List<String> list = new ArrayList<String>(tokens.length);
445         for (final String element : tokens) {
446             list.add(element);
447         }
448         return list;
449     }
450 
451     /**
452      * Resets this tokenizer, forgetting all parsing and iteration already completed.
453      * <p>
454      * This method allows the same tokenizer to be reused for the same String.
455      *
456      * @return this, to enable chaining
457      */
458     public StrTokenizer reset() {
459         tokenPos = 0;
460         tokens = null;
461         return this;
462     }
463 
464     /**
465      * Reset this tokenizer, giving it a new input string to parse.
466      * In this manner you can re-use a tokenizer with the same settings
467      * on multiple input lines.
468      *
469      * @param input  the new string to tokenize, null sets no text to parse
470      * @return this, to enable chaining
471      */
472     public StrTokenizer reset(final String input) {
473         reset();
474         if (input != null) {
475             this.chars = input.toCharArray();
476         } else {
477             this.chars = null;
478         }
479         return this;
480     }
481 
482     /**
483      * Reset this tokenizer, giving it a new input string to parse.
484      * In this manner you can re-use a tokenizer with the same settings
485      * on multiple input lines.
486      *
487      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
488      * @return this, to enable chaining
489      */
490     public StrTokenizer reset(final char[] input) {
491         reset();
492         this.chars = ArrayUtils.clone(input);
493         return this;
494     }
495 
496     // ListIterator
497     //-----------------------------------------------------------------------
498     /**
499      * Checks whether there are any more tokens.
500      *
501      * @return true if there are more tokens
502      */
503     @Override
504     public boolean hasNext() {
505         checkTokenized();
506         return tokenPos < tokens.length;
507     }
508 
509     /**
510      * Gets the next token.
511      *
512      * @return the next String token
513      * @throws NoSuchElementException if there are no more elements
514      */
515     @Override
516     public String next() {
517         if (hasNext()) {
518             return tokens[tokenPos++];
519         }
520         throw new NoSuchElementException();
521     }
522 
523     /**
524      * Gets the index of the next token to return.
525      *
526      * @return the next token index
527      */
528     @Override
529     public int nextIndex() {
530         return tokenPos;
531     }
532 
533     /**
534      * Checks whether there are any previous tokens that can be iterated to.
535      *
536      * @return true if there are previous tokens
537      */
538     @Override
539     public boolean hasPrevious() {
540         checkTokenized();
541         return tokenPos > 0;
542     }
543 
544     /**
545      * Gets the token previous to the last returned token.
546      *
547      * @return the previous token
548      */
549     @Override
550     public String previous() {
551         if (hasPrevious()) {
552             return tokens[--tokenPos];
553         }
554         throw new NoSuchElementException();
555     }
556 
557     /**
558      * Gets the index of the previous token.
559      *
560      * @return the previous token index
561      */
562     @Override
563     public int previousIndex() {
564         return tokenPos - 1;
565     }
566 
567     /**
568      * Unsupported ListIterator operation.
569      *
570      * @throws UnsupportedOperationException always
571      */
572     @Override
573     public void remove() {
574         throw new UnsupportedOperationException("remove() is unsupported");
575     }
576 
577     /**
578      * Unsupported ListIterator operation.
579      * @param obj this parameter ignored.
580      * @throws UnsupportedOperationException always
581      */
582     @Override
583     public void set(final String obj) {
584         throw new UnsupportedOperationException("set() is unsupported");
585     }
586 
587     /**
588      * Unsupported ListIterator operation.
589      * @param obj this parameter ignored.
590      * @throws UnsupportedOperationException always
591      */
592     @Override
593     public void add(final String obj) {
594         throw new UnsupportedOperationException("add() is unsupported");
595     }
596 
597     // Implementation
598     //-----------------------------------------------------------------------
599     /**
600      * Checks if tokenization has been done, and if not then do it.
601      */
602     private void checkTokenized() {
603         if (tokens == null) {
604             if (chars == null) {
605                 // still call tokenize as subclass may do some work
606                 final List<String> split = tokenize(null, 0, 0);
607                 tokens = split.toArray(new String[split.size()]);
608             } else {
609                 final List<String> split = tokenize(chars, 0, chars.length);
610                 tokens = split.toArray(new String[split.size()]);
611             }
612         }
613     }
614 
615     /**
616      * Internal method to performs the tokenization.
617      * <p>
618      * Most users of this class do not need to call this method. This method
619      * will be called automatically by other (public) methods when required.
620      * <p>
621      * This method exists to allow subclasses to add code before or after the
622      * tokenization. For example, a subclass could alter the character array,
623      * offset or count to be parsed, or call the tokenizer multiple times on
624      * multiple strings. It is also be possible to filter the results.
625      * <p>
626      * <code>StrTokenizer</code> will always pass a zero offset and a count
627      * equal to the length of the array to this method, however a subclass
628      * may pass other values, or even an entirely different array.
629      * 
630      * @param chars  the character array being tokenized, may be null
631      * @param offset  the start position within the character array, must be valid
632      * @param count  the number of characters to tokenize, must be valid
633      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
634      */
635     protected List<String> tokenize(final char[] chars, final int offset, final int count) {
636         if (chars == null || count == 0) {
637             return Collections.emptyList();
638         }
639         final StrBuilder buf = new StrBuilder();
640         final List<String> tokens = new ArrayList<String>();
641         int pos = offset;
642         
643         // loop around the entire buffer
644         while (pos >= 0 && pos < count) {
645             // find next token
646             pos = readNextToken(chars, pos, count, buf, tokens);
647             
648             // handle case where end of string is a delimiter
649             if (pos >= count) {
650                 addToken(tokens, "");
651             }
652         }
653         return tokens;
654     }
655 
656     /**
657      * Adds a token to a list, paying attention to the parameters we've set.
658      *
659      * @param list  the list to add to
660      * @param tok  the token to add
661      */
662     private void addToken(final List<String> list, String tok) {
663         if (StringUtils.isEmpty(tok)) {
664             if (isIgnoreEmptyTokens()) {
665                 return;
666             }
667             if (isEmptyTokenAsNull()) {
668                 tok = null;
669             }
670         }
671         list.add(tok);
672     }
673 
674     /**
675      * Reads character by character through the String to get the next token.
676      *
677      * @param chars  the character array being tokenized
678      * @param start  the first character of field
679      * @param len  the length of the character array being tokenized
680      * @param workArea  a temporary work area
681      * @param tokens  the list of parsed tokens
682      * @return the starting position of the next field (the character
683      *  immediately after the delimiter), or -1 if end of string found
684      */
685     private int readNextToken(final char[] chars, int start, final int len, final StrBuilder workArea, final List<String> tokens) {
686         // skip all leading whitespace, unless it is the
687         // field delimiter or the quote character
688         while (start < len) {
689             final int removeLen = Math.max(
690                     getIgnoredMatcher().isMatch(chars, start, start, len),
691                     getTrimmerMatcher().isMatch(chars, start, start, len));
692             if (removeLen == 0 ||
693                 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
694                 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
695                 break;
696             }
697             start += removeLen;
698         }
699         
700         // handle reaching end
701         if (start >= len) {
702             addToken(tokens, "");
703             return -1;
704         }
705         
706         // handle empty token
707         final int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
708         if (delimLen > 0) {
709             addToken(tokens, "");
710             return start + delimLen;
711         }
712         
713         // handle found token
714         final int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
715         if (quoteLen > 0) {
716             return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
717         }
718         return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
719     }
720 
721     /**
722      * Reads a possibly quoted string token.
723      *
724      * @param chars  the character array being tokenized
725      * @param start  the first character of field
726      * @param len  the length of the character array being tokenized
727      * @param workArea  a temporary work area
728      * @param tokens  the list of parsed tokens
729      * @param quoteStart  the start position of the matched quote, 0 if no quoting
730      * @param quoteLen  the length of the matched quote, 0 if no quoting
731      * @return the starting position of the next field (the character
732      *  immediately after the delimiter, or if end of string found,
733      *  then the length of string
734      */
735     private int readWithQuotes(final char[] chars, final int start, final int len, final StrBuilder workArea, 
736                                final List<String> tokens, final int quoteStart, final int quoteLen) {
737         // Loop until we've found the end of the quoted
738         // string or the end of the input
739         workArea.clear();
740         int pos = start;
741         boolean quoting = quoteLen > 0;
742         int trimStart = 0;
743         
744         while (pos < len) {
745             // quoting mode can occur several times throughout a string
746             // we must switch between quoting and non-quoting until we
747             // encounter a non-quoted delimiter, or end of string
748             if (quoting) {
749                 // In quoting mode
750                 
751                 // If we've found a quote character, see if it's
752                 // followed by a second quote.  If so, then we need
753                 // to actually put the quote character into the token
754                 // rather than end the token.
755                 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
756                     if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
757                         // matched pair of quotes, thus an escaped quote
758                         workArea.append(chars, pos, quoteLen);
759                         pos += quoteLen * 2;
760                         trimStart = workArea.size();
761                         continue;
762                     }
763                     
764                     // end of quoting
765                     quoting = false;
766                     pos += quoteLen;
767                     continue;
768                 }
769                 
770                 // copy regular character from inside quotes
771                 workArea.append(chars[pos++]);
772                 trimStart = workArea.size();
773                 
774             } else {
775                 // Not in quoting mode
776                 
777                 // check for delimiter, and thus end of token
778                 final int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
779                 if (delimLen > 0) {
780                     // return condition when end of token found
781                     addToken(tokens, workArea.substring(0, trimStart));
782                     return pos + delimLen;
783                 }
784                 
785                 // check for quote, and thus back into quoting mode
786                 if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
787                     quoting = true;
788                     pos += quoteLen;
789                     continue;
790                 }
791                 
792                 // check for ignored (outside quotes), and ignore
793                 final int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
794                 if (ignoredLen > 0) {
795                     pos += ignoredLen;
796                     continue;
797                 }
798                 
799                 // check for trimmed character
800                 // don't yet know if its at the end, so copy to workArea
801                 // use trimStart to keep track of trim at the end
802                 final int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
803                 if (trimmedLen > 0) {
804                     workArea.append(chars, pos, trimmedLen);
805                     pos += trimmedLen;
806                     continue;
807                 }
808                 
809                 // copy regular character from outside quotes
810                 workArea.append(chars[pos++]);
811                 trimStart = workArea.size();
812             }
813         }
814         
815         // return condition when end of string found
816         addToken(tokens, workArea.substring(0, trimStart));
817         return -1;
818     }
819 
820     /**
821      * Checks if the characters at the index specified match the quote
822      * already matched in readNextToken().
823      *
824      * @param chars  the character array being tokenized
825      * @param pos  the position to check for a quote
826      * @param len  the length of the character array being tokenized
827      * @param quoteStart  the start position of the matched quote, 0 if no quoting
828      * @param quoteLen  the length of the matched quote, 0 if no quoting
829      * @return true if a quote is matched
830      */
831     private boolean isQuote(final char[] chars, final int pos, final int len, final int quoteStart, final int quoteLen) {
832         for (int i = 0; i < quoteLen; i++) {
833             if (pos + i >= len || chars[pos + i] != chars[quoteStart + i]) {
834                 return false;
835             }
836         }
837         return true;
838     }
839 
840     // Delimiter
841     //-----------------------------------------------------------------------
842     /**
843      * Gets the field delimiter matcher.
844      *
845      * @return the delimiter matcher in use
846      */
847     public StrMatcher getDelimiterMatcher() {
848         return this.delimMatcher;
849     }
850 
851     /**
852      * Sets the field delimiter matcher.
853      * <p>
854      * The delimitier is used to separate one token from another.
855      *
856      * @param delim  the delimiter matcher to use
857      * @return this, to enable chaining
858      */
859     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
860         if (delim == null) {
861             this.delimMatcher = StrMatcher.noneMatcher();
862         } else {
863             this.delimMatcher = delim;
864         }
865         return this;
866     }
867 
868     /**
869      * Sets the field delimiter character.
870      *
871      * @param delim  the delimiter character to use
872      * @return this, to enable chaining
873      */
874     public StrTokenizer setDelimiterChar(final char delim) {
875         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
876     }
877 
878     /**
879      * Sets the field delimiter string.
880      *
881      * @param delim  the delimiter string to use
882      * @return this, to enable chaining
883      */
884     public StrTokenizer setDelimiterString(final String delim) {
885         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
886     }
887 
888     // Quote
889     //-----------------------------------------------------------------------
890     /**
891      * Gets the quote matcher currently in use.
892      * <p>
893      * The quote character is used to wrap data between the tokens.
894      * This enables delimiters to be entered as data.
895      * The default value is '"' (double quote).
896      *
897      * @return the quote matcher in use
898      */
899     public StrMatcher getQuoteMatcher() {
900         return quoteMatcher;
901     }
902 
903     /**
904      * Set the quote matcher to use.
905      * <p>
906      * The quote character is used to wrap data between the tokens.
907      * This enables delimiters to be entered as data.
908      *
909      * @param quote  the quote matcher to use, null ignored
910      * @return this, to enable chaining
911      */
912     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
913         if (quote != null) {
914             this.quoteMatcher = quote;
915         }
916         return this;
917     }
918 
919     /**
920      * Sets the quote character to use.
921      * <p>
922      * The quote character is used to wrap data between the tokens.
923      * This enables delimiters to be entered as data.
924      *
925      * @param quote  the quote character to use
926      * @return this, to enable chaining
927      */
928     public StrTokenizer setQuoteChar(final char quote) {
929         return setQuoteMatcher(StrMatcher.charMatcher(quote));
930     }
931 
932     // Ignored
933     //-----------------------------------------------------------------------
934     /**
935      * Gets the ignored character matcher.
936      * <p>
937      * These characters are ignored when parsing the String, unless they are
938      * within a quoted region.
939      * The default value is not to ignore anything.
940      *
941      * @return the ignored matcher in use
942      */
943     public StrMatcher getIgnoredMatcher() {
944         return ignoredMatcher;
945     }
946 
947     /**
948      * Set the matcher for characters to ignore.
949      * <p>
950      * These characters are ignored when parsing the String, unless they are
951      * within a quoted region.
952      *
953      * @param ignored  the ignored matcher to use, null ignored
954      * @return this, to enable chaining
955      */
956     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
957         if (ignored != null) {
958             this.ignoredMatcher = ignored;
959         }
960         return this;
961     }
962 
963     /**
964      * Set the character to ignore.
965      * <p>
966      * This character is ignored when parsing the String, unless it is
967      * within a quoted region.
968      *
969      * @param ignored  the ignored character to use
970      * @return this, to enable chaining
971      */
972     public StrTokenizer setIgnoredChar(final char ignored) {
973         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
974     }
975 
976     // Trimmer
977     //-----------------------------------------------------------------------
978     /**
979      * Gets the trimmer character matcher.
980      * <p>
981      * These characters are trimmed off on each side of the delimiter
982      * until the token or quote is found.
983      * The default value is not to trim anything.
984      *
985      * @return the trimmer matcher in use
986      */
987     public StrMatcher getTrimmerMatcher() {
988         return trimmerMatcher;
989     }
990 
991     /**
992      * Sets the matcher for characters to trim.
993      * <p>
994      * These characters are trimmed off on each side of the delimiter
995      * until the token or quote is found.
996      *
997      * @param trimmer  the trimmer matcher to use, null ignored
998      * @return this, to enable chaining
999      */
1000     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1001         if (trimmer != null) {
1002             this.trimmerMatcher = trimmer;
1003         }
1004         return this;
1005     }
1006 
1007     //-----------------------------------------------------------------------
1008     /**
1009      * Gets whether the tokenizer currently returns empty tokens as null.
1010      * The default for this property is false.
1011      *
1012      * @return true if empty tokens are returned as null
1013      */
1014     public boolean isEmptyTokenAsNull() {
1015         return this.emptyAsNull;
1016     }
1017 
1018     /**
1019      * Sets whether the tokenizer should return empty tokens as null.
1020      * The default for this property is false.
1021      *
1022      * @param emptyAsNull  whether empty tokens are returned as null
1023      * @return this, to enable chaining
1024      */
1025     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1026         this.emptyAsNull = emptyAsNull;
1027         return this;
1028     }
1029 
1030     //-----------------------------------------------------------------------
1031     /**
1032      * Gets whether the tokenizer currently ignores empty tokens.
1033      * The default for this property is true.
1034      *
1035      * @return true if empty tokens are not returned
1036      */
1037     public boolean isIgnoreEmptyTokens() {
1038         return ignoreEmptyTokens;
1039     }
1040 
1041     /**
1042      * Sets whether the tokenizer should ignore and not return empty tokens.
1043      * The default for this property is true.
1044      *
1045      * @param ignoreEmptyTokens  whether empty tokens are not returned
1046      * @return this, to enable chaining
1047      */
1048     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1049         this.ignoreEmptyTokens = ignoreEmptyTokens;
1050         return this;
1051     }
1052 
1053     //-----------------------------------------------------------------------
1054     /**
1055      * Gets the String content that the tokenizer is parsing.
1056      *
1057      * @return the string content being parsed
1058      */
1059     public String getContent() {
1060         if (chars == null) {
1061             return null;
1062         }
1063         return new String(chars);
1064     }
1065 
1066     //-----------------------------------------------------------------------
1067     /**
1068      * Creates a new instance of this Tokenizer. The new instance is reset so
1069      * that it will be at the start of the token list.
1070      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1071      * 
1072      * @return a new instance of this Tokenizer which has been reset.
1073      */
1074     @Override
1075     public Object clone() {
1076         try {
1077             return cloneReset();
1078         } catch (final CloneNotSupportedException ex) {
1079             return null;
1080         }
1081     }
1082 
1083     /**
1084      * Creates a new instance of this Tokenizer. The new instance is reset so that
1085      * it will be at the start of the token list.
1086      * 
1087      * @return a new instance of this Tokenizer which has been reset.
1088      * @throws CloneNotSupportedException if there is a problem cloning
1089      */
1090     Object cloneReset() throws CloneNotSupportedException {
1091         // this method exists to enable 100% test coverage
1092         final StrTokenizer cloned = (StrTokenizer) super.clone();
1093         if (cloned.chars != null) {
1094             cloned.chars = cloned.chars.clone();
1095         }
1096         cloned.reset();
1097         return cloned;
1098     }
1099 
1100     //-----------------------------------------------------------------------
1101     /**
1102      * Gets the String content that the tokenizer is parsing.
1103      *
1104      * @return the string content being parsed
1105      */
1106     @Override
1107     public String toString() {
1108         if (tokens == null) {
1109             return "StrTokenizer[not tokenized yet]";
1110         }
1111         return "StrTokenizer" + getTokenList();
1112     }
1113 
1114 }