1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  /**
26   * Tokenizes a string based on delimiters (separators)
27   * and supporting quoting and ignored character concepts.
28   * <p>
29   * This class can split a String into many smaller strings. It aims
30   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
31   * however, it offers much more control and flexibility, including
32   * implementing the <code>ListIterator</code> interface. By default, it is
33   * set up like <code>StringTokenizer</code>.
34   * <p>
35   * The input String is split into a number of <i>tokens</i>.
36   * Each token is separated from the next by a <i>delimiter</i>.
37   * One or more delimiter characters must be specified.
38   * <p>
39   * Each token may be surrounded by quotes.
40   * The <i>quote</i> matcher specifies the quote character(s).
41   * A quote may be escaped within a quoted section by duplicating itself.
42   * <p>
43   * Between each token and the delimiter there may be characters that need trimming.
44   * The <i>trimmer</i> matcher specifies these characters.
45   * One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there may be unwanted characters.
48   * The <i>ignored</i> matcher specifies these characters to be removed.
49   * One usage might be to remove new line characters.
50   * <p>
51   * Empty tokens may be removed or returned as null.
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   *
58   * <table>
59   *  <caption>StrTokenizer properties and options</caption>
60   *  <tr>
61   *   <th>Property</th><th>Type</th><th>Default</th>
62   *  </tr>
63   *  <tr>
64   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
65   *  </tr>
66   *  <tr>
67   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
68   *  </tr>
69   *  <tr>
70   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
71   *  </tr>
72   *  <tr>
73   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
74   *  </tr>
75   *  <tr>
76   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
77   *  </tr>
78   * </table>
79   *
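 * <p>
 * A short, illustrative CSV example (the expected tokens follow from the
 * defaults described above):
 * <pre>
 * StrTokenizer tokenizer = StrTokenizer.getCSVInstance("a, b, \"c, d\"");
 * List&lt;String&gt; tokens = tokenizer.getTokenList(); // "a", "b", "c, d"
 * </pre>
 *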
80   * @since 1.0
81   * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
82   */
83  @Deprecated
84  public class StrTokenizer implements ListIterator<String>, Cloneable {
85  
86      /** Comma separated values tokenizer internal variable. */
87      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
88      /** Tab separated values tokenizer internal variable. */
89      private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
90      static {
91          CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
92          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
93          CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
94          CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
95          CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
96          CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
97          CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
98  
99          TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
100         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
101         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
102         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
103         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
104         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
105         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
106     }
107 
108     /** The text to work on. */
109     private char[] chars;
110     /** The parsed tokens. */
111     private String[] tokens;
112     /** The current iteration position. */
113     private int tokenPos;
114 
115     /** The delimiter matcher. */
116     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
117     /** The quote matcher. */
118     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
119     /** The ignored matcher. */
120     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
121     /** The trimmer matcher. */
122     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
123 
124     /** Whether to return empty tokens as null. */
125     private boolean emptyAsNull = false;
126     /** Whether to ignore empty tokens. */
127     private boolean ignoreEmptyTokens = true;
128 
129     //-----------------------------------------------------------------------
130 
131     /**
132      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
133      *
134      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135      */
136     private static StrTokenizer getCSVClone() {
137         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
138     }
139 
140     /**
141      * Gets a new tokenizer instance which parses Comma Separated Value strings,
142      * with no text to parse yet.  The default for CSV processing is to trim
143      * whitespace from both ends (this can be overridden with the
144      * setTrimmerMatcher method).
145      * <p>
146      * You must call a "reset" method to set the string which you want to parse.
147      * @return a new tokenizer instance which parses Comma Separated Value strings
148      */
149     public static StrTokenizer getCSVInstance() {
150         return getCSVClone();
151     }
152 
153     /**
154      * Gets a new tokenizer instance which parses Comma Separated Value strings
155      * initializing it with the given input.  The default for CSV processing
156      * is to trim whitespace from both ends (this can be overridden with the
157      * setTrimmerMatcher method).
158      *
159      * @param input  the text to parse
160      * @return a new tokenizer instance which parses Comma Separated Value strings
161      */
162     public static StrTokenizer getCSVInstance(final String input) {
163         final StrTokenizer tok = getCSVClone();
164         tok.reset(input);
165         return tok;
166     }
167 
168     /**
169      * Gets a new tokenizer instance which parses Comma Separated Value strings
170      * initializing it with the given input.  The default for CSV processing
171      * is to trim whitespace from both ends (this can be overridden with the
172      * setTrimmerMatcher method).
173      *
174      * @param input  the text to parse
175      * @return a new tokenizer instance which parses Comma Separated Value strings
176      */
177     public static StrTokenizer getCSVInstance(final char[] input) {
178         final StrTokenizer tok = getCSVClone();
179         tok.reset(input);
180         return tok;
181     }
182 
183     /**
184      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
185      *
186      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187      */
188     private static StrTokenizer getTSVClone() {
189         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
190     }
191 
192 
193     /**
194      * Gets a new tokenizer instance which parses Tab Separated Value strings.
195      * The default for TSV processing is to trim whitespace from both ends
196      * (this can be overridden with the setTrimmerMatcher method).
197      * <p>
198      * You must call a "reset" method to set the string which you want to parse.
199      * @return a new tokenizer instance which parses Tab Separated Value strings.
200      */
201     public static StrTokenizer getTSVInstance() {
202         return getTSVClone();
203     }
204 
205     /**
206      * Gets a new tokenizer instance which parses Tab Separated Value strings.
207      * The default for TSV processing is to trim whitespace from both ends
208      * (this can be overridden with the setTrimmerMatcher method).
209      * @param input  the string to parse
210      * @return a new tokenizer instance which parses Tab Separated Value strings.
211      */
212     public static StrTokenizer getTSVInstance(final String input) {
213         final StrTokenizer tok = getTSVClone();
214         tok.reset(input);
215         return tok;
216     }
217 
218     /**
219      * Gets a new tokenizer instance which parses Tab Separated Value strings.
220      * The default for TSV processing is to trim whitespace from both ends
221      * (this can be overridden with the setTrimmerMatcher method).
222      * @param input  the string to parse
223      * @return a new tokenizer instance which parses Tab Separated Value strings.
224      */
225     public static StrTokenizer getTSVInstance(final char[] input) {
226         final StrTokenizer tok = getTSVClone();
227         tok.reset(input);
228         return tok;
229     }
230 
231     //-----------------------------------------------------------------------
232     /**
233      * Constructs a tokenizer splitting on space, tab, newline, carriage return
234      * and form feed, as per StringTokenizer, but with no text to tokenize.
235      * <p>
236      * This constructor is normally used with {@link #reset(String)}.
237      */
238     public StrTokenizer() {
239         super();
240         this.chars = null;
241     }
242 
243     /**
244      * Constructs a tokenizer splitting on space, tab, newline, carriage return
245      * and form feed, as per StringTokenizer.
246      *
247      * @param input  the string which is to be parsed
248      */
249     public StrTokenizer(final String input) {
250         super();
251         if (input != null) {
252             chars = input.toCharArray();
253         } else {
254             chars = null;
255         }
256     }
257 
258     /**
259      * Constructs a tokenizer splitting on the specified delimiter character.
260      *
261      * @param input  the string which is to be parsed
262      * @param delim  the field delimiter character
263      */
264     public StrTokenizer(final String input, final char delim) {
265         this(input);
266         setDelimiterChar(delim);
267     }
268 
269     /**
270      * Constructs a tokenizer splitting on the specified delimiter string.
271      *
272      * @param input  the string which is to be parsed
273      * @param delim  the field delimiter string
274      */
275     public StrTokenizer(final String input, final String delim) {
276         this(input);
277         setDelimiterString(delim);
278     }
279 
280     /**
281      * Constructs a tokenizer splitting using the specified delimiter matcher.
282      *
283      * @param input  the string which is to be parsed
284      * @param delim  the field delimiter matcher
285      */
286     public StrTokenizer(final String input, final StrMatcher delim) {
287         this(input);
288         setDelimiterMatcher(delim);
289     }
290 
291     /**
292      * Constructs a tokenizer splitting on the specified delimiter character
293      * and handling quotes using the specified quote character.
294      *
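     * <p>
     * For example, quoting lets a delimiter appear inside a field:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a,\"b,c\",d", ',', '"');
     * // tokens: "a", "b,c", "d"
     * </pre>
     *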
295      * @param input  the string which is to be parsed
296      * @param delim  the field delimiter character
297      * @param quote  the field quoted string character
298      */
299     public StrTokenizer(final String input, final char delim, final char quote) {
300         this(input, delim);
301         setQuoteChar(quote);
302     }
303 
304     /**
305      * Constructs a tokenizer splitting using the specified delimiter matcher
306      * and handling quotes using the specified quote matcher.
307      *
308      * @param input  the string which is to be parsed
309      * @param delim  the field delimiter matcher
310      * @param quote  the field quoted string matcher
311      */
312     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
313         this(input, delim);
314         setQuoteMatcher(quote);
315     }
316 
317     /**
318      * Constructs a tokenizer splitting on space, tab, newline, carriage return
319      * and form feed, as per StringTokenizer.
320      *
321      * @param input  the character array which is to be parsed; the array is cloned
322      */
323     public StrTokenizer(final char[] input) {
324         super();
325         if (input == null) {
326             this.chars = null;
327         } else {
328             this.chars = input.clone();
329         }
330     }
331 
332     /**
333      * Constructs a tokenizer splitting on the specified character.
334      *
335      * @param input  the character array which is to be parsed; the array is cloned
336      * @param delim the field delimiter character
337      */
338     public StrTokenizer(final char[] input, final char delim) {
339         this(input);
340         setDelimiterChar(delim);
341     }
342 
343     /**
344      * Constructs a tokenizer splitting on the specified string.
345      *
346      * @param input  the character array which is to be parsed; the array is cloned
347      * @param delim the field delimiter string
348      */
349     public StrTokenizer(final char[] input, final String delim) {
350         this(input);
351         setDelimiterString(delim);
352     }
353 
354     /**
355      * Constructs a tokenizer splitting using the specified delimiter matcher.
356      *
357      * @param input  the character array which is to be parsed; the array is cloned
358      * @param delim  the field delimiter matcher
359      */
360     public StrTokenizer(final char[] input, final StrMatcher delim) {
361         this(input);
362         setDelimiterMatcher(delim);
363     }
364 
365     /**
366      * Constructs a tokenizer splitting on the specified delimiter character
367      * and handling quotes using the specified quote character.
368      *
369      * @param input  the character array which is to be parsed; the array is cloned
370      * @param delim  the field delimiter character
371      * @param quote  the field quoted string character
372      */
373     public StrTokenizer(final char[] input, final char delim, final char quote) {
374         this(input, delim);
375         setQuoteChar(quote);
376     }
377 
378     /**
379      * Constructs a tokenizer splitting using the specified delimiter matcher
380      * and handling quotes using the specified quote matcher.
381      *
382      * @param input  the character array which is to be parsed; the array is cloned
383      * @param delim  the field delimiter matcher
384      * @param quote  the field quoted string matcher
385      */
386     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
387         this(input, delim);
388         setQuoteMatcher(quote);
389     }
390 
391     // API
392     //-----------------------------------------------------------------------
393     /**
394      * Gets the number of tokens found in the String.
395      *
396      * @return the number of matched tokens
397      */
398     public int size() {
399         checkTokenized();
400         return tokens.length;
401     }
402 
403     /**
404      * Gets the next token from the String.
405      * Equivalent to {@link #next()} except it returns null rather than
406      * throwing {@link NoSuchElementException} when no tokens remain.
407      *
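     * <p>
     * A typical forward scan might look like:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a b c");
     * String token;
     * while ((token = tokenizer.nextToken()) != null) {
     *     // "a", then "b", then "c"
     * }
     * </pre>
     *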
408      * @return the next sequential token, or null when no more tokens are found
409      */
410     public String nextToken() {
411         if (hasNext()) {
412             return tokens[tokenPos++];
413         }
414         return null;
415     }
416 
417     /**
418      * Gets the previous token from the String.
419      *
420      * @return the previous sequential token, or null if there are no previous tokens
421      */
422     public String previousToken() {
423         if (hasPrevious()) {
424             return tokens[--tokenPos];
425         }
426         return null;
427     }
428 
429     /**
430      * Gets a copy of the full token list as an independent modifiable array.
431      *
432      * @return the tokens as a String array
433      */
434     public String[] getTokenArray() {
435         checkTokenized();
436         return tokens.clone();
437     }
438 
439     /**
440      * Gets a copy of the full token list as an independent modifiable list.
441      *
442      * @return the tokens as a String list
443      */
444     public List<String> getTokenList() {
445         checkTokenized();
446         final List<String> list = new ArrayList<>(tokens.length);
447         Collections.addAll(list, tokens);
448 
449         return list;
450     }
451 
452     /**
453      * Resets this tokenizer, forgetting all parsing and iteration already completed.
454      * <p>
455      * This method allows the same tokenizer to be reused for the same String.
456      *
457      * @return this, to enable chaining
458      */
459     public StrTokenizer reset() {
460         tokenPos = 0;
461         tokens = null;
462         return this;
463     }
464 
465     /**
466      * Resets this tokenizer, giving it a new input string to parse.
467      * In this manner you can re-use a tokenizer with the same settings
468      * on multiple input lines.
469      *
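     * <p>
     * For example, one tokenizer configured for comma-separated fields might be
     * reused across several lines (<code>lines</code> here is any iterable of
     * Strings and is purely illustrative):
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("", ',');
     * for (String line : lines) {
     *     String[] fields = tokenizer.reset(line).getTokenArray();
     *     // process fields ...
     * }
     * </pre>
     *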
470      * @param input  the new string to tokenize, null sets no text to parse
471      * @return this, to enable chaining
472      */
473     public StrTokenizer reset(final String input) {
474         reset();
475         if (input != null) {
476             this.chars = input.toCharArray();
477         } else {
478             this.chars = null;
479         }
480         return this;
481     }
482 
483     /**
484      * Resets this tokenizer, giving it a new input character array to parse.
485      * In this manner you can re-use a tokenizer with the same settings
486      * on multiple input lines.
487      *
488      * @param input  the new character array to tokenize; the array is cloned, null sets no text to parse
489      * @return this, to enable chaining
490      */
491     public StrTokenizer reset(final char[] input) {
492         reset();
493         if (input != null) {
494             this.chars = input.clone();
495         } else {
496             this.chars = null;
497         }
498         return this;
499     }
500 
501     // ListIterator
502     //-----------------------------------------------------------------------
503     /**
504      * Checks whether there are any more tokens.
505      *
506      * @return true if there are more tokens
507      */
508     @Override
509     public boolean hasNext() {
510         checkTokenized();
511         return tokenPos < tokens.length;
512     }
513 
514     /**
515      * Gets the next token.
516      *
517      * @return the next String token
518      * @throws NoSuchElementException if there are no more elements
519      */
520     @Override
521     public String next() {
522         if (hasNext()) {
523             return tokens[tokenPos++];
524         }
525         throw new NoSuchElementException();
526     }
527 
528     /**
529      * Gets the index of the next token to return.
530      *
531      * @return the next token index
532      */
533     @Override
534     public int nextIndex() {
535         return tokenPos;
536     }
537 
538     /**
539      * Checks whether there are any previous tokens that can be iterated to.
540      *
541      * @return true if there are previous tokens
542      */
543     @Override
544     public boolean hasPrevious() {
545         checkTokenized();
546         return tokenPos > 0;
547     }
548 
549     /**
550      * Gets the token previous to the last returned token.
551      *
552      * @return the previous token
553      */
554     @Override
555     public String previous() {
556         if (hasPrevious()) {
557             return tokens[--tokenPos];
558         }
559         throw new NoSuchElementException();
560     }
561 
562     /**
563      * Gets the index of the previous token.
564      *
565      * @return the previous token index
566      */
567     @Override
568     public int previousIndex() {
569         return tokenPos - 1;
570     }
571 
572     /**
573      * Unsupported ListIterator operation.
574      *
575      * @throws UnsupportedOperationException always
576      */
577     @Override
578     public void remove() {
579         throw new UnsupportedOperationException("remove() is unsupported");
580     }
581 
582     /**
583      * Unsupported ListIterator operation.
584      * @param obj  this parameter is ignored.
585      * @throws UnsupportedOperationException always
586      */
587     @Override
588     public void set(final String obj) {
589         throw new UnsupportedOperationException("set() is unsupported");
590     }
591 
592     /**
593      * Unsupported ListIterator operation.
594      * @param obj  this parameter is ignored.
595      * @throws UnsupportedOperationException always
596      */
597     @Override
598     public void add(final String obj) {
599         throw new UnsupportedOperationException("add() is unsupported");
600     }
601 
602     // Implementation
603     //-----------------------------------------------------------------------
604     /**
605      * Checks if tokenization has been done and, if not, does it.
606      */
607     private void checkTokenized() {
608         if (tokens == null) {
609             if (chars == null) {
610                 // still call tokenize as subclass may do some work
611                 final List<String> split = tokenize(null, 0, 0);
612                 tokens = split.toArray(new String[split.size()]);
613             } else {
614                 final List<String> split = tokenize(chars, 0, chars.length);
615                 tokens = split.toArray(new String[split.size()]);
616             }
617         }
618     }
619 
620     /**
621      * Internal method that performs the tokenization.
622      * <p>
623      * Most users of this class do not need to call this method. This method
624      * will be called automatically by other (public) methods when required.
625      * <p>
626      * This method exists to allow subclasses to add code before or after the
627      * tokenization. For example, a subclass could alter the character array,
628      * offset or count to be parsed, or call the tokenizer multiple times on
629      * multiple strings. It is also possible to filter the results.
630      * <p>
631      * <code>StrTokenizer</code> will always pass a zero offset and a count
632      * equal to the length of the array to this method; however, a subclass
633      * may pass other values, or even an entirely different array.
634      *
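     * <p>
     * As a sketch only, a subclass could post-process the token list, for
     * example dropping a marker token (the marker value here is purely
     * illustrative):
     * <pre>
     * &#64;Override
     * protected List&lt;String&gt; tokenize(final char[] srcChars, final int offset, final int count) {
     *     final List&lt;String&gt; result = new ArrayList&lt;&gt;(super.tokenize(srcChars, offset, count));
     *     result.removeIf("SKIP"::equals);
     *     return result;
     * }
     * </pre>
     *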
635      * @param srcChars  the character array being tokenized, may be null
636      * @param offset  the start position within the character array, must be valid
637      * @param count  the number of characters to tokenize, must be valid
638      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
639      */
640     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
641         if (srcChars == null || count == 0) {
642             return Collections.emptyList();
643         }
644         final StrBuilder buf = new StrBuilder();
645         final List<String> tokenList = new ArrayList<>();
646         int pos = offset;
647 
648         // loop around the entire buffer
649         while (pos >= 0 && pos < count) {
650             // find next token
651             pos = readNextToken(srcChars, pos, count, buf, tokenList);
652 
653             // handle case where end of string is a delimiter
654             if (pos >= count) {
655                 addToken(tokenList, "");
656             }
657         }
658         return tokenList;
659     }
660 
661     /**
662      * Adds a token to a list, paying attention to the parameters we've set.
663      *
664      * @param list  the list to add to
665      * @param tok  the token to add
666      */
667     private void addToken(final List<String> list, String tok) {
668         if (tok == null || tok.length() == 0) {
669             if (isIgnoreEmptyTokens()) {
670                 return;
671             }
672             if (isEmptyTokenAsNull()) {
673                 tok = null;
674             }
675         }
676         list.add(tok);
677     }
678 
679     /**
680      * Reads character by character through the String to get the next token.
681      *
682      * @param srcChars  the character array being tokenized
683      * @param start  the first character of the field
684      * @param len  the length of the character array being tokenized
685      * @param workArea  a temporary work area
686      * @param tokenList  the list of parsed tokens
687      * @return the starting position of the next field (the character
688      *  immediately after the delimiter), or -1 if end of string found
689      */
690     private int readNextToken(final char[] srcChars,
691                               int start,
692                               final int len,
693                               final StrBuilder workArea,
694                               final List<String> tokenList) {
695         // skip all leading ignored or trimmed characters, unless one is
696         // the field delimiter or the quote character
697         while (start < len) {
698             final int removeLen = Math.max(
699                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
700                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
701             if (removeLen == 0
702                     || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
703                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
704                 break;
705             }
706             start += removeLen;
707         }
708 
709         // handle reaching end
710         if (start >= len) {
711             addToken(tokenList, "");
712             return -1;
713         }
714 
715         // handle empty token
716         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
717         if (delimLen > 0) {
718             addToken(tokenList, "");
719             return start + delimLen;
720         }
721 
722         // handle found token
723         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
724         if (quoteLen > 0) {
725             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
726         }
727         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
728     }
729 
730     /**
731      * Reads a possibly quoted string token.
732      *
733      * @param srcChars  the character array being tokenized
734      * @param start  the first character of the field
735      * @param len  the length of the character array being tokenized
736      * @param workArea  a temporary work area
737      * @param tokenList  the list of parsed tokens
738      * @param quoteStart  the start position of the matched quote, 0 if no quoting
739      * @param quoteLen  the length of the matched quote, 0 if no quoting
740      * @return the starting position of the next field (the character
741      *  immediately after the delimiter), or -1 if the end of the
742      *  string was reached
743      */
744     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
745                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
746         // Loop until we've found the end of the quoted
747         // string or the end of the input
748         workArea.clear();
749         int pos = start;
750         boolean quoting = quoteLen > 0;
751         int trimStart = 0;
752 
753         while (pos < len) {
754             // quoting mode can occur several times throughout a string
755             // we must switch between quoting and non-quoting until we
756             // encounter a non-quoted delimiter, or end of string
757             if (quoting) {
758                 // In quoting mode
759 
760                 // If we've found a quote character, see if it's
761                 // followed by a second quote.  If so, then we need
762                 // to actually put the quote character into the token
763                 // rather than end the token.
764                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
765                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
766                         // matched pair of quotes, thus an escaped quote
767                         workArea.append(srcChars, pos, quoteLen);
768                         pos += quoteLen * 2;
769                         trimStart = workArea.size();
770                         continue;
771                     }
772 
773                     // end of quoting
774                     quoting = false;
775                     pos += quoteLen;
776                     continue;
777                 }
778 
779                 // copy regular character from inside quotes
780                 workArea.append(srcChars[pos++]);
781                 trimStart = workArea.size();
782 
783             } else {
784                 // Not in quoting mode
785 
786                 // check for delimiter, and thus end of token
787                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
788                 if (delimLen > 0) {
789                     // return condition when end of token found
790                     addToken(tokenList, workArea.substring(0, trimStart));
791                     return pos + delimLen;
792                 }
793 
794                 // check for quote, and thus back into quoting mode
795                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
796                     quoting = true;
797                     pos += quoteLen;
798                     continue;
799                 }
800 
801                 // check for ignored (outside quotes), and ignore
802                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
803                 if (ignoredLen > 0) {
804                     pos += ignoredLen;
805                     continue;
806                 }
807 
808                 // check for trimmed character
809                 // don't yet know if it's at the end, so copy to workArea
810                 // use trimStart to keep track of trim at the end
811                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
812                 if (trimmedLen > 0) {
813                     workArea.append(srcChars, pos, trimmedLen);
814                     pos += trimmedLen;
815                     continue;
816                 }
817 
818                 // copy regular character from outside quotes
819                 workArea.append(srcChars[pos++]);
820                 trimStart = workArea.size();
821             }
822         }
823 
824         // return condition when end of string found
825         addToken(tokenList, workArea.substring(0, trimStart));
826         return -1;
827     }
828 
829     /**
830      * Checks if the characters at the index specified match the quote
831      * already matched in readNextToken().
832      *
833      * @param srcChars  the character array being tokenized
834      * @param pos  the position to check for a quote
835      * @param len  the length of the character array being tokenized
836      * @param quoteStart  the start position of the matched quote, 0 if no quoting
837      * @param quoteLen  the length of the matched quote, 0 if no quoting
838      * @return true if a quote is matched
839      */
840     private boolean isQuote(final char[] srcChars,
841                             final int pos,
842                             final int len,
843                             final int quoteStart,
844                             final int quoteLen) {
845         for (int i = 0; i < quoteLen; i++) {
846             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
847                 return false;
848             }
849         }
850         return true;
851     }
852 
853     // Delimiter
854     //-----------------------------------------------------------------------
855     /**
856      * Gets the field delimiter matcher.
857      *
858      * @return the delimiter matcher in use
859      */
860     public StrMatcher getDelimiterMatcher() {
861         return this.delimMatcher;
862     }
863 
864     /**
865      * Sets the field delimiter matcher.
866      * <p>
867      * The delimiter is used to separate one token from another.
868      *
869      * @param delim  the delimiter matcher to use
870      * @return this, to enable chaining
871      */
872     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
873         if (delim == null) {
874             this.delimMatcher = StrMatcher.noneMatcher();
875         } else {
876             this.delimMatcher = delim;
877         }
878         return this;
879     }
880 
881     /**
882      * Sets the field delimiter character.
883      *
884      * @param delim  the delimiter character to use
885      * @return this, to enable chaining
886      */
887     public StrTokenizer setDelimiterChar(final char delim) {
888         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
889     }
890 
891     /**
892      * Sets the field delimiter string.
893      *
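     * <p>
     * For example, to split on a multi-character delimiter:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a::b::c");
     * tokenizer.setDelimiterString("::");
     * // tokens: "a", "b", "c"
     * </pre>
     *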
894      * @param delim  the delimiter string to use
895      * @return this, to enable chaining
896      */
897     public StrTokenizer setDelimiterString(final String delim) {
898         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
899     }
900 
901     // Quote
902     //-----------------------------------------------------------------------
903     /**
904      * Gets the quote matcher currently in use.
905      * <p>
906      * The quote character is used to wrap data between the tokens.
907      * This enables delimiters to be entered as data.
908      * The default value is the none matcher, meaning no quoting is performed.
909      *
910      * @return the quote matcher in use
911      */
912     public StrMatcher getQuoteMatcher() {
913         return quoteMatcher;
914     }
915 
916     /**
917      * Sets the quote matcher to use.
918      * <p>
919      * The quote character is used to wrap data between the tokens.
920      * This enables delimiters to be entered as data.
921      *
922      * @param quote  the quote matcher to use, null ignored
923      * @return this, to enable chaining
924      */
925     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
926         if (quote != null) {
927             this.quoteMatcher = quote;
928         }
929         return this;
930     }
931 
932     /**
933      * Sets the quote character to use.
934      * <p>
935      * The quote character is used to wrap data between the tokens.
936      * This enables delimiters to be entered as data.
937      *
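     * <p>
     * For example, with a comma delimiter and a double-quote quote character,
     * a doubled quote inside a quoted section is treated as an escaped quote:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a,\"b,\"\"c\"\"\"", ',', '"');
     * String[] tokens = tokenizer.getTokenArray(); // "a" and "b,\"c\""
     * </pre>
     *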
938      * @param quote  the quote character to use
939      * @return this, to enable chaining
940      */
941     public StrTokenizer setQuoteChar(final char quote) {
942         return setQuoteMatcher(StrMatcher.charMatcher(quote));
943     }
944 
945     // Ignored
946     //-----------------------------------------------------------------------
947     /**
948      * Gets the ignored character matcher.
949      * <p>
950      * These characters are ignored when parsing the String, unless they are
951      * within a quoted region.
952      * The default value is not to ignore anything.
953      *
954      * @return the ignored matcher in use
955      */
956     public StrMatcher getIgnoredMatcher() {
957         return ignoredMatcher;
958     }
959 
960     /**
961      * Sets the matcher for characters to ignore.
962      * <p>
963      * These characters are ignored when parsing the String, unless they are
964      * within a quoted region.
965      *
966      * @param ignored  the ignored matcher to use, null ignored
967      * @return this, to enable chaining
968      */
969     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
970         if (ignored != null) {
971             this.ignoredMatcher = ignored;
972         }
973         return this;
974     }
975 
976     /**
977      * Sets the character to ignore.
978      * <p>
979      * This character is ignored when parsing the String, unless it is
980      * within a quoted region.
981      *
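     * <p>
     * For example, to drop stray carriage returns from the input:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer("a,b\r,c", ',');
     * tokenizer.setIgnoredChar('\r');
     * // tokens: "a", "b", "c"
     * </pre>
     *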
982      * @param ignored  the ignored character to use
983      * @return this, to enable chaining
984      */
985     public StrTokenizer setIgnoredChar(final char ignored) {
986         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
987     }
988 
989     // Trimmer
990     //-----------------------------------------------------------------------
991     /**
992      * Gets the trimmer character matcher.
993      * <p>
994      * These characters are trimmed off on each side of the delimiter
995      * until the token or quote is found.
996      * The default value is not to trim anything.
997      *
998      * @return the trimmer matcher in use
999      */
1000     public StrMatcher getTrimmerMatcher() {
1001         return trimmerMatcher;
1002     }
1003 
1004     /**
1005      * Sets the matcher for characters to trim.
1006      * <p>
1007      * These characters are trimmed off on each side of the delimiter
1008      * until the token or quote is found.
1009      *
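     * <p>
     * For example, to trim whitespace around each token:
     * <pre>
     * StrTokenizer tokenizer = new StrTokenizer(" a ; b ", ';');
     * tokenizer.setTrimmerMatcher(StrMatcher.trimMatcher());
     * // tokens: "a", "b"
     * </pre>
     *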
1010      * @param trimmer  the trimmer matcher to use, null ignored
1011      * @return this, to enable chaining
1012      */
1013     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1014         if (trimmer != null) {
1015             this.trimmerMatcher = trimmer;
1016         }
1017         return this;
1018     }
1019 
1020     //-----------------------------------------------------------------------
1021     /**
1022      * Gets whether the tokenizer currently returns empty tokens as null.
1023      * The default for this property is false.
1024      *
1025      * @return true if empty tokens are returned as null
1026      */
1027     public boolean isEmptyTokenAsNull() {
1028         return this.emptyAsNull;
1029     }
1030 
1031     /**
1032      * Sets whether the tokenizer should return empty tokens as null.
1033      * The default for this property is false.
1034      *
1035      * @param emptyAsNull  whether empty tokens are returned as null
1036      * @return this, to enable chaining
1037      */
1038     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1039         this.emptyAsNull = emptyAsNull;
1040         return this;
1041     }
1042 
1043     //-----------------------------------------------------------------------
1044     /**
1045      * Gets whether the tokenizer currently ignores empty tokens.
1046      * The default for this property is true.
1047      *
1048      * @return true if empty tokens are not returned
1049      */
1050     public boolean isIgnoreEmptyTokens() {
1051         return ignoreEmptyTokens;
1052     }
1053 
1054     /**
1055      * Sets whether the tokenizer should ignore and not return empty tokens.
1056      * The default for this property is true.
1057      *
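     * <p>
     * Combined with {@link #setEmptyTokenAsNull(boolean)}, an input of
     * {@code "a,,b"} with a comma delimiter behaves as follows:
     * <pre>
     * ignoreEmptyTokens=true                      - tokens "a", "b"
     * ignoreEmptyTokens=false, emptyAsNull=false  - tokens "a", "", "b"
     * ignoreEmptyTokens=false, emptyAsNull=true   - tokens "a", null, "b"
     * </pre>
     *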
1058      * @param ignoreEmptyTokens  whether empty tokens are not returned
1059      * @return this, to enable chaining
1060      */
1061     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1062         this.ignoreEmptyTokens = ignoreEmptyTokens;
1063         return this;
1064     }
1065 
1066     //-----------------------------------------------------------------------
1067     /**
1068      * Gets the String content that the tokenizer is parsing.
1069      *
1070      * @return the string content being parsed
1071      */
1072     public String getContent() {
1073         if (chars == null) {
1074             return null;
1075         }
1076         return new String(chars);
1077     }
1078 
1079     //-----------------------------------------------------------------------
1080     /**
1081      * Creates a new instance of this Tokenizer. The new instance is reset so
1082      * that it will be at the start of the token list.
1083      * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
1084      *
1085      * @return a new instance of this Tokenizer which has been reset.
1086      */
1087     @Override
1088     public Object clone() {
1089         try {
1090             return cloneReset();
1091         } catch (final CloneNotSupportedException ex) {
1092             return null;
1093         }
1094     }
1095 
1096     /**
1097      * Creates a new instance of this Tokenizer. The new instance is reset so that
1098      * it will be at the start of the token list.
1099      *
1100      * @return a new instance of this Tokenizer which has been reset.
1101      * @throws CloneNotSupportedException if there is a problem cloning
1102      */
1103     Object cloneReset() throws CloneNotSupportedException {
1104         // this method exists to enable 100% test coverage
1105         final StrTokenizer cloned = (StrTokenizer) super.clone();
1106         if (cloned.chars != null) {
1107             cloned.chars = cloned.chars.clone();
1108         }
1109         cloned.reset();
1110         return cloned;
1111     }
1112 
1113     //-----------------------------------------------------------------------
1114     /**
1115      * Gets a string representation of this tokenizer, listing the parsed
1116      * tokens if tokenization has already occurred.
1117      * @return a string representation of the tokenizer state
1118      */
1119     @Override
1120     public String toString() {
1121         if (tokens == null) {
1122             return "StrTokenizer[not tokenized yet]";
1123         }
1124         return "StrTokenizer" + getTokenList();
1125     }
1126 
1127 }