1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  /**
26   * Tokenizes a string based on delimiters (separators)
27   * and supporting quoting and ignored character concepts.
28   * <p>
29   * This class can split a String into many smaller strings. It aims
30   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
31   * however, it offers much more control and flexibility, including implementing
32   * the <code>ListIterator</code> interface. By default, it is set up
33   * like <code>StringTokenizer</code>.
34   * <p>
35   * The input String is split into a number of <i>tokens</i>.
36   * Each token is separated from the next by a <i>delimiter</i>.
37   * One or more delimiter characters must be specified.
38   * <p>
39   * Each token may be surrounded by quotes.
40   * The <i>quote</i> matcher specifies the quote character(s).
41   * A quote may be escaped within a quoted section by duplicating itself.
42   * <p>
43   * Between each token and the delimiter there may be characters that need trimming.
44   * The <i>trimmer</i> matcher specifies these characters.
45   * One usage might be to trim whitespace characters.
46   * <p>
47   * At any point outside the quotes there may be invalid characters.
48   * The <i>ignored</i> matcher specifies these characters to be removed.
49   * One usage might be to remove new line characters.
50   * <p>
51   * Empty tokens may be removed or returned as null.
52   * <pre>
53   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
54   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
55   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56   * </pre>
57   * <p>
58   *
59   * This tokenizer has the following properties and options:
60   *
61   * <table summary="Tokenizer Properties">
62   *  <tr>
63   *   <th>Property</th><th>Type</th><th>Default</th>
64   *  </tr>
65   *  <tr>
66   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
67   *  </tr>
68   *  <tr>
69   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
70   *  </tr>
71   *  <tr>
72   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
73   *  </tr>
74   *  <tr>
75   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
76   *  </tr>
77   *  <tr>
78   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
79   *  </tr>
80   * </table>
81   *
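 * <p>
 * As a minimal usage sketch (the input value below is illustrative only),
 * splitting on a comma with double-quote quoting might look like:
 * <pre>{@code
 * StrTokenizer tok = new StrTokenizer("a,\"b,c\",d", ',', '"');
 * while (tok.hasNext()) {
 *     String token = tok.next();   // yields "a", then "b,c", then "d"
 * }
 * }</pre>
 *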
82   * @since 1.0
83   */
84  public class StrTokenizer implements ListIterator<String>, Cloneable {
85  
86      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
87      private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
88      static {
89          CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
90          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
91          CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
92          CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
93          CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
94          CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
95          CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
96  
97          TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
98          TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
99          TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
100         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
101         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
102         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
103         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
104     }
105 
106     /** The text to work on. */
107     private char[] chars;
108     /** The parsed tokens */
109     private String[] tokens;
110     /** The current iteration position */
111     private int tokenPos;
112 
113     /** The delimiter matcher */
114     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
115     /** The quote matcher */
116     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
117     /** The ignored matcher */
118     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
119     /** The trimmer matcher */
120     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
121 
122     /** Whether to return empty tokens as null */
123     private boolean emptyAsNull = false;
124     /** Whether to ignore empty tokens */
125     private boolean ignoreEmptyTokens = true;
126 
127     //-----------------------------------------------------------------------
128 
129     /**
130      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
131      *
132      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
133      */
134     private static StrTokenizer getCSVClone() {
135         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
136     }
137 
138     /**
139      * Gets a new tokenizer instance which parses Comma Separated Value strings.
140      * No input is set initially.  The default for CSV processing
141      * is to trim whitespace from both ends (which can be overridden with
142      * the setTrimmerMatcher method).
143      * <p>
144      * You must call a "reset" method to set the string which you want to parse.
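     * <p>
     * For example (illustrative values only):
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset(" a, b ,\"c,d\" ");
     * String[] fields = tok.getTokenArray();   // ["a", "b", "c,d"]
     * }</pre>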
145      * @return a new tokenizer instance which parses Comma Separated Value strings
146      */
147     public static StrTokenizer getCSVInstance() {
148         return getCSVClone();
149     }
150 
151     /**
152      * Gets a new tokenizer instance which parses Comma Separated Value strings
153      * initializing it with the given input.  The default for CSV processing
154      * is to trim whitespace from both ends (which can be overridden with
155      * the setTrimmerMatcher method).
156      *
157      * @param input  the text to parse
158      * @return a new tokenizer instance which parses Comma Separated Value strings
159      */
160     public static StrTokenizer getCSVInstance(final String input) {
161         final StrTokenizer tok = getCSVClone();
162         tok.reset(input);
163         return tok;
164     }
165 
166     /**
167      * Gets a new tokenizer instance which parses Comma Separated Value strings
168      * initializing it with the given input.  The default for CSV processing
169      * is to trim whitespace from both ends (which can be overridden with
170      * the setTrimmerMatcher method).
171      *
172      * @param input  the text to parse
173      * @return a new tokenizer instance which parses Comma Separated Value strings
174      */
175     public static StrTokenizer getCSVInstance(final char[] input) {
176         final StrTokenizer tok = getCSVClone();
177         tok.reset(input);
178         return tok;
179     }
180 
181     /**
182      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
183      *
184      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
185      */
186     private static StrTokenizer getTSVClone() {
187         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
188     }
189 
190 
191     /**
192      * Gets a new tokenizer instance which parses Tab Separated Value strings.
193      * The default for TSV processing is to trim whitespace from both ends
194      * (which can be overridden with the setTrimmerMatcher method).
195      * <p>
196      * You must call a "reset" method to set the string which you want to parse.
197      * @return a new tokenizer instance which parses Tab Separated Value strings.
198      */
199     public static StrTokenizer getTSVInstance() {
200         return getTSVClone();
201     }
202 
203     /**
204      * Gets a new tokenizer instance which parses Tab Separated Value strings.
205      * The default for TSV processing is to trim whitespace from both ends
206      * (which can be overridden with the setTrimmerMatcher method).
207      * @param input  the string to parse
208      * @return a new tokenizer instance which parses Tab Separated Value strings.
209      */
210     public static StrTokenizer getTSVInstance(final String input) {
211         final StrTokenizer tok = getTSVClone();
212         tok.reset(input);
213         return tok;
214     }
215 
216     /**
217      * Gets a new tokenizer instance which parses Tab Separated Value strings.
218      * The default for TSV processing is to trim whitespace from both ends
219      * (which can be overridden with the setTrimmerMatcher method).
220      * @param input  the string to parse
221      * @return a new tokenizer instance which parses Tab Separated Value strings.
222      */
223     public static StrTokenizer getTSVInstance(final char[] input) {
224         final StrTokenizer tok = getTSVClone();
225         tok.reset(input);
226         return tok;
227     }
228 
229     //-----------------------------------------------------------------------
230     /**
231      * Constructs a tokenizer splitting on space, tab, newline and formfeed
232      * as per StringTokenizer, but with no text to tokenize.
233      * <p>
234      * This constructor is normally used with {@link #reset(String)}.
235      */
236     public StrTokenizer() {
237         super();
238         this.chars = null;
239     }
240 
241     /**
242      * Constructs a tokenizer splitting on space, tab, newline and formfeed
243      * as per StringTokenizer.
244      *
245      * @param input  the string which is to be parsed
246      */
247     public StrTokenizer(final String input) {
248         super();
249         if (input != null) {
250             chars = input.toCharArray();
251         } else {
252             chars = null;
253         }
254     }
255 
256     /**
257      * Constructs a tokenizer splitting on the specified delimiter character.
258      *
259      * @param input  the string which is to be parsed
260      * @param delim  the field delimiter character
261      */
262     public StrTokenizer(final String input, final char delim) {
263         this(input);
264         setDelimiterChar(delim);
265     }
266 
267     /**
268      * Constructs a tokenizer splitting on the specified delimiter string.
269      *
270      * @param input  the string which is to be parsed
271      * @param delim  the field delimiter string
272      */
273     public StrTokenizer(final String input, final String delim) {
274         this(input);
275         setDelimiterString(delim);
276     }
277 
278     /**
279      * Constructs a tokenizer splitting using the specified delimiter matcher.
280      *
281      * @param input  the string which is to be parsed
282      * @param delim  the field delimiter matcher
283      */
284     public StrTokenizer(final String input, final StrMatcher delim) {
285         this(input);
286         setDelimiterMatcher(delim);
287     }
288 
289     /**
290      * Constructs a tokenizer splitting on the specified delimiter character
291      * and handling quotes using the specified quote character.
292      *
293      * @param input  the string which is to be parsed
294      * @param delim  the field delimiter character
295      * @param quote  the field quoted string character
296      */
297     public StrTokenizer(final String input, final char delim, final char quote) {
298         this(input, delim);
299         setQuoteChar(quote);
300     }
301 
302     /**
303      * Constructs a tokenizer splitting using the specified delimiter matcher
304      * and handling quotes using the specified quote matcher.
305      *
306      * @param input  the string which is to be parsed
307      * @param delim  the field delimiter matcher
308      * @param quote  the field quoted string matcher
309      */
310     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
311         this(input, delim);
312         setQuoteMatcher(quote);
313     }
314 
315     /**
316      * Constructs a tokenizer splitting on space, tab, newline and formfeed
317      * as per StringTokenizer.
318      *
319      * @param input  the character array which is to be parsed (a copy is taken)
320      */
321     public StrTokenizer(final char[] input) {
322         super();
323         if (input == null) {
324             this.chars = null;
325         } else {
326             this.chars = input.clone();
327         }
328     }
329 
330     /**
331      * Constructs a tokenizer splitting on the specified character.
332      *
333      * @param input  the character array which is to be parsed (a copy is taken)
334      * @param delim the field delimiter character
335      */
336     public StrTokenizer(final char[] input, final char delim) {
337         this(input);
338         setDelimiterChar(delim);
339     }
340 
341     /**
342      * Constructs a tokenizer splitting on the specified string.
343      *
344      * @param input  the character array which is to be parsed (a copy is taken)
345      * @param delim the field delimiter string
346      */
347     public StrTokenizer(final char[] input, final String delim) {
348         this(input);
349         setDelimiterString(delim);
350     }
351 
352     /**
353      * Constructs a tokenizer splitting using the specified delimiter matcher.
354      *
355      * @param input  the character array which is to be parsed (a copy is taken)
356      * @param delim  the field delimiter matcher
357      */
358     public StrTokenizer(final char[] input, final StrMatcher delim) {
359         this(input);
360         setDelimiterMatcher(delim);
361     }
362 
363     /**
364      * Constructs a tokenizer splitting on the specified delimiter character
365      * and handling quotes using the specified quote character.
366      *
367      * @param input  the character array which is to be parsed (a copy is taken)
368      * @param delim  the field delimiter character
369      * @param quote  the field quoted string character
370      */
371     public StrTokenizer(final char[] input, final char delim, final char quote) {
372         this(input, delim);
373         setQuoteChar(quote);
374     }
375 
376     /**
377      * Constructs a tokenizer splitting using the specified delimiter matcher
378      * and handling quotes using the specified quote matcher.
379      *
380      * @param input  the character array which is to be parsed (a copy is taken)
381      * @param delim  the field delimiter matcher
382      * @param quote  the field quoted string matcher
383      */
384     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
385         this(input, delim);
386         setQuoteMatcher(quote);
387     }
388 
389     // API
390     //-----------------------------------------------------------------------
391     /**
392      * Gets the number of tokens found in the String.
393      *
394      * @return the number of matched tokens
395      */
396     public int size() {
397         checkTokenized();
398         return tokens.length;
399     }
400 
401     /**
402      * Gets the next token from the String.
403      * Equivalent to {@link #next()} except it returns null rather than
404      * throwing {@link NoSuchElementException} when no tokens remain.
405      *
406      * @return the next sequential token, or null when no more tokens are found
407      */
408     public String nextToken() {
409         if (hasNext()) {
410             return tokens[tokenPos++];
411         }
412         return null;
413     }
414 
415     /**
416      * Gets the previous token from the String.
417      *
418      * @return the previous sequential token, or null when there are no previous tokens
419      */
420     public String previousToken() {
421         if (hasPrevious()) {
422             return tokens[--tokenPos];
423         }
424         return null;
425     }
426 
427     /**
428      * Gets a copy of the full token list as an independent modifiable array.
429      *
430      * @return the tokens as a String array
431      */
432     public String[] getTokenArray() {
433         checkTokenized();
434         return tokens.clone();
435     }
436 
437     /**
438      * Gets a copy of the full token list as an independent modifiable list.
439      *
440      * @return the tokens as a String list
441      */
442     public List<String> getTokenList() {
443         checkTokenized();
444         final List<String> list = new ArrayList<>(tokens.length);
445         for (final String element : tokens) {
446             list.add(element);
447         }
448         return list;
449     }
450 
451     /**
452      * Resets this tokenizer, forgetting all parsing and iteration already completed.
453      * <p>
454      * This method allows the same tokenizer to be reused for the same String.
455      *
456      * @return this, to enable chaining
457      */
458     public StrTokenizer reset() {
459         tokenPos = 0;
460         tokens = null;
461         return this;
462     }
463 
464     /**
465      * Reset this tokenizer, giving it a new input string to parse.
466      * In this manner you can re-use a tokenizer with the same settings
467      * on multiple input lines.
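     * <p>
     * For example (the variable {@code lines} below is assumed to be an
     * existing {@code Iterable<String>} of input rows):
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     String[] fields = tok.getTokenArray();
     *     // process fields ...
     * }
     * }</pre>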
468      *
469      * @param input  the new string to tokenize, null sets no text to parse
470      * @return this, to enable chaining
471      */
472     public StrTokenizer reset(final String input) {
473         reset();
474         if (input != null) {
475             this.chars = input.toCharArray();
476         } else {
477             this.chars = null;
478         }
479         return this;
480     }
481 
482     /**
483      * Reset this tokenizer, giving it a new input string to parse.
484      * In this manner you can re-use a tokenizer with the same settings
485      * on multiple input lines.
486      *
487      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
488      * @return this, to enable chaining
489      */
490     public StrTokenizer reset(final char[] input) {
491         reset();
492         if (input != null) {
493             this.chars = input;
494         } else {
495             this.chars = null;
496         }
497         return this;
498     }
499 
500     // ListIterator
501     //-----------------------------------------------------------------------
502     /**
503      * Checks whether there are any more tokens.
504      *
505      * @return true if there are more tokens
506      */
507     @Override
508     public boolean hasNext() {
509         checkTokenized();
510         return tokenPos < tokens.length;
511     }
512 
513     /**
514      * Gets the next token.
515      *
516      * @return the next String token
517      * @throws NoSuchElementException if there are no more elements
518      */
519     @Override
520     public String next() {
521         if (hasNext()) {
522             return tokens[tokenPos++];
523         }
524         throw new NoSuchElementException();
525     }
526 
527     /**
528      * Gets the index of the next token to return.
529      *
530      * @return the next token index
531      */
532     @Override
533     public int nextIndex() {
534         return tokenPos;
535     }
536 
537     /**
538      * Checks whether there are any previous tokens that can be iterated to.
539      *
540      * @return true if there are previous tokens
541      */
542     @Override
543     public boolean hasPrevious() {
544         checkTokenized();
545         return tokenPos > 0;
546     }
547 
548     /**
549      * Gets the token previous to the last returned token.
550      *
551      * @return the previous token
552      */
553     @Override
554     public String previous() {
555         if (hasPrevious()) {
556             return tokens[--tokenPos];
557         }
558         throw new NoSuchElementException();
559     }
560 
561     /**
562      * Gets the index of the previous token.
563      *
564      * @return the previous token index
565      */
566     @Override
567     public int previousIndex() {
568         return tokenPos - 1;
569     }
570 
571     /**
572      * Unsupported ListIterator operation.
573      *
574      * @throws UnsupportedOperationException always
575      */
576     @Override
577     public void remove() {
578         throw new UnsupportedOperationException("remove() is unsupported");
579     }
580 
581     /**
582      * Unsupported ListIterator operation.
583      * @param obj this parameter ignored.
584      * @throws UnsupportedOperationException always
585      */
586     @Override
587     public void set(final String obj) {
588         throw new UnsupportedOperationException("set() is unsupported");
589     }
590 
591     /**
592      * Unsupported ListIterator operation.
593      * @param obj this parameter ignored.
594      * @throws UnsupportedOperationException always
595      */
596     @Override
597     public void add(final String obj) {
598         throw new UnsupportedOperationException("add() is unsupported");
599     }
600 
601     // Implementation
602     //-----------------------------------------------------------------------
603     /**
604      * Checks if tokenization has been done, and if not, performs it.
605      */
606     private void checkTokenized() {
607         if (tokens == null) {
608             if (chars == null) {
609                 // still call tokenize as subclass may do some work
610                 final List<String> split = tokenize(null, 0, 0);
611                 tokens = split.toArray(new String[split.size()]);
612             } else {
613                 final List<String> split = tokenize(chars, 0, chars.length);
614                 tokens = split.toArray(new String[split.size()]);
615             }
616         }
617     }
618 
619     /**
620      * Internal method that performs the tokenization.
621      * <p>
622      * Most users of this class do not need to call this method. This method
623      * will be called automatically by other (public) methods when required.
624      * <p>
625      * This method exists to allow subclasses to add code before or after the
626      * tokenization. For example, a subclass could alter the character array,
627      * offset or count to be parsed, or call the tokenizer multiple times on
628      * multiple strings. It is also possible to filter the results.
629      * <p>
630      * <code>StrTokenizer</code> will always pass a zero offset and a count
631      * equal to the length of the array to this method; however, a subclass
632      * may pass other values, or even an entirely different array.
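     * <p>
     * A hypothetical subclass sketch (the class name and filter below are
     * illustrative, not part of this library) that post-filters the tokens:
     * <pre>{@code
     * public class FilteringTokenizer extends StrTokenizer {
     *     protected List<String> tokenize(char[] srcChars, int offset, int count) {
     *         // copy into a fresh list so the result is always modifiable
     *         List<String> tokens = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *         // illustrative filter: drop null tokens and comment-like fields
     *         tokens.removeIf(t -> t == null || t.startsWith("#"));
     *         return tokens;
     *     }
     * }
     * }</pre>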
633      *
634      * @param srcChars  the character array being tokenized, may be null
635      * @param offset  the start position within the character array, must be valid
636      * @param count  the number of characters to tokenize, must be valid
637      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
638      */
639     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
640         if (srcChars == null || count == 0) {
641             return Collections.emptyList();
642         }
643         final StrBuilder buf = new StrBuilder();
644         final List<String> tokenList = new ArrayList<>();
645         int pos = offset;
646 
647         // loop around the entire buffer
648         while (pos >= 0 && pos < count) {
649             // find next token
650             pos = readNextToken(srcChars, pos, count, buf, tokenList);
651 
652             // handle case where end of string is a delimiter
653             if (pos >= count) {
654                 addToken(tokenList, "");
655             }
656         }
657         return tokenList;
658     }
659 
660     /**
661      * Adds a token to a list, paying attention to the parameters we've set.
662      *
663      * @param list  the list to add to
664      * @param tok  the token to add
665      */
666     private void addToken(final List<String> list, String tok) {
667         if (tok == null || tok.length() == 0) {
668             if (isIgnoreEmptyTokens()) {
669                 return;
670             }
671             if (isEmptyTokenAsNull()) {
672                 tok = null;
673             }
674         }
675         list.add(tok);
676     }
677 
678     /**
679      * Reads character by character through the String to get the next token.
680      *
681      * @param srcChars  the character array being tokenized
682      * @param start  the first character of field
683      * @param len  the length of the character array being tokenized
684      * @param workArea  a temporary work area
685      * @param tokenList  the list of parsed tokens
686      * @return the starting position of the next field (the character
687      *  immediately after the delimiter), or -1 if end of string found
688      */
689     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
690         // skip all leading ignored and trimmed characters, unless the
691         // character is the field delimiter or the quote character
692         while (start < len) {
693             final int removeLen = Math.max(
694                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
695                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
696             if (removeLen == 0 ||
697                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
698                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
699                 break;
700             }
701             start += removeLen;
702         }
703 
704         // handle reaching end
705         if (start >= len) {
706             addToken(tokenList, "");
707             return -1;
708         }
709 
710         // handle empty token
711         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
712         if (delimLen > 0) {
713             addToken(tokenList, "");
714             return start + delimLen;
715         }
716 
717         // handle found token
718         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
719         if (quoteLen > 0) {
720             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
721         }
722         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
723     }
724 
725     /**
726      * Reads a possibly quoted string token.
727      *
728      * @param srcChars  the character array being tokenized
729      * @param start  the first character of field
730      * @param len  the length of the character array being tokenized
731      * @param workArea  a temporary work area
732      * @param tokenList  the list of parsed tokens
733      * @param quoteStart  the start position of the matched quote, 0 if no quoting
734      * @param quoteLen  the length of the matched quote, 0 if no quoting
735      * @return the starting position of the next field (the character
736      *  immediately after the delimiter), or -1 if the end of
737      *  the string is found
738      */
739     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
740                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
741         // Loop until we've found the end of the quoted
742         // string or the end of the input
743         workArea.clear();
744         int pos = start;
745         boolean quoting = quoteLen > 0;
746         int trimStart = 0;
747 
748         while (pos < len) {
749             // quoting mode can occur several times throughout a string
750             // we must switch between quoting and non-quoting until we
751             // encounter a non-quoted delimiter, or end of string
752             if (quoting) {
753                 // In quoting mode
754 
755                 // If we've found a quote character, see if it's
756                 // followed by a second quote.  If so, then we need
757                 // to actually put the quote character into the token
758                 // rather than end the token.
759                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
760                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
761                         // matched pair of quotes, thus an escaped quote
762                         workArea.append(srcChars, pos, quoteLen);
763                         pos += quoteLen * 2;
764                         trimStart = workArea.size();
765                         continue;
766                     }
767 
768                     // end of quoting
769                     quoting = false;
770                     pos += quoteLen;
771                     continue;
772                 }
773 
774                 // copy regular character from inside quotes
775                 workArea.append(srcChars[pos++]);
776                 trimStart = workArea.size();
777 
778             } else {
779                 // Not in quoting mode
780 
781                 // check for delimiter, and thus end of token
782                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
783                 if (delimLen > 0) {
784                     // return condition when end of token found
785                     addToken(tokenList, workArea.substring(0, trimStart));
786                     return pos + delimLen;
787                 }
788 
789                 // check for quote, and thus back into quoting mode
790                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
791                     quoting = true;
792                     pos += quoteLen;
793                     continue;
794                 }
795 
796                 // check for ignored (outside quotes), and ignore
797                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
798                 if (ignoredLen > 0) {
799                     pos += ignoredLen;
800                     continue;
801                 }
802 
803                 // check for trimmed character
804         // don't yet know if it's at the end, so copy to workArea
805                 // use trimStart to keep track of trim at the end
806                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
807                 if (trimmedLen > 0) {
808                     workArea.append(srcChars, pos, trimmedLen);
809                     pos += trimmedLen;
810                     continue;
811                 }
812 
813                 // copy regular character from outside quotes
814                 workArea.append(srcChars[pos++]);
815                 trimStart = workArea.size();
816             }
817         }
818 
819         // return condition when end of string found
820         addToken(tokenList, workArea.substring(0, trimStart));
821         return -1;
822     }
823 
824     /**
825      * Checks if the characters at the index specified match the quote
826      * already matched in readNextToken().
827      *
828      * @param srcChars  the character array being tokenized
829      * @param pos  the position to check for a quote
830      * @param len  the length of the character array being tokenized
831      * @param quoteStart  the start position of the matched quote, 0 if no quoting
832      * @param quoteLen  the length of the matched quote, 0 if no quoting
833      * @return true if a quote is matched
834      */
835     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
836         for (int i = 0; i < quoteLen; i++) {
837             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
838                 return false;
839             }
840         }
841         return true;
842     }
843 
844     // Delimiter
845     //-----------------------------------------------------------------------
846     /**
847      * Gets the field delimiter matcher.
848      *
849      * @return the delimiter matcher in use
850      */
851     public StrMatcher getDelimiterMatcher() {
852         return this.delimMatcher;
853     }
854 
855     /**
856      * Sets the field delimiter matcher.
857      * <p>
858      * The delimiter is used to separate one token from another.
859      *
860      * @param delim  the delimiter matcher to use
861      * @return this, to enable chaining
862      */
863     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
864         if (delim == null) {
865             this.delimMatcher = StrMatcher.noneMatcher();
866         } else {
867             this.delimMatcher = delim;
868         }
869         return this;
870     }
871 
872     /**
873      * Sets the field delimiter character.
874      *
875      * @param delim  the delimiter character to use
876      * @return this, to enable chaining
877      */
878     public StrTokenizer setDelimiterChar(final char delim) {
879         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
880     }
881 
882     /**
883      * Sets the field delimiter string.
884      *
885      * @param delim  the delimiter string to use
886      * @return this, to enable chaining
887      */
888     public StrTokenizer setDelimiterString(final String delim) {
889         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
890     }
891 
892     // Quote
893     //-----------------------------------------------------------------------
894     /**
895      * Gets the quote matcher currently in use.
896      * <p>
897      * The quote character is used to wrap data between the tokens.
898      * This enables delimiters to be entered as data.
899      * The default is the none matcher, i.e. quoting is not used.
900      *
901      * @return the quote matcher in use
902      */
903     public StrMatcher getQuoteMatcher() {
904         return quoteMatcher;
905     }
906 
907     /**
908      * Set the quote matcher to use.
909      * <p>
910      * The quote character is used to wrap data between the tokens.
911      * This enables delimiters to be entered as data.
912      *
913      * @param quote  the quote matcher to use, null ignored
914      * @return this, to enable chaining
915      */
916     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
917         if (quote != null) {
918             this.quoteMatcher = quote;
919         }
920         return this;
921     }
922 
923     /**
924      * Sets the quote character to use.
925      * <p>
926      * The quote character is used to wrap data between the tokens.
927      * This enables delimiters to be entered as data.
928      *
929      * @param quote  the quote character to use
930      * @return this, to enable chaining
931      */
932     public StrTokenizer setQuoteChar(final char quote) {
933         return setQuoteMatcher(StrMatcher.charMatcher(quote));
934     }
935 
936     // Ignored
937     //-----------------------------------------------------------------------
938     /**
939      * Gets the ignored character matcher.
940      * <p>
941      * These characters are ignored when parsing the String, unless they are
942      * within a quoted region.
943      * The default value is not to ignore anything.
944      *
945      * @return the ignored matcher in use
946      */
947     public StrMatcher getIgnoredMatcher() {
948         return ignoredMatcher;
949     }
950 
951     /**
952      * Set the matcher for characters to ignore.
953      * <p>
954      * These characters are ignored when parsing the String, unless they are
955      * within a quoted region.
956      *
957      * @param ignored  the ignored matcher to use, null ignored
958      * @return this, to enable chaining
959      */
960     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
961         if (ignored != null) {
962             this.ignoredMatcher = ignored;
963         }
964         return this;
965     }
966 
967     /**
968      * Set the character to ignore.
969      * <p>
970      * This character is ignored when parsing the String, unless it is
971      * within a quoted region.
972      *
973      * @param ignored  the ignored character to use
974      * @return this, to enable chaining
975      */
976     public StrTokenizer setIgnoredChar(final char ignored) {
977         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
978     }
979 
980     // Trimmer
981     //-----------------------------------------------------------------------
982     /**
983      * Gets the trimmer character matcher.
984      * <p>
985      * These characters are trimmed off on each side of the delimiter
986      * until the token or quote is found.
987      * The default value is not to trim anything.
988      *
989      * @return the trimmer matcher in use
990      */
991     public StrMatcher getTrimmerMatcher() {
992         return trimmerMatcher;
993     }
994 
995     /**
996      * Sets the matcher for characters to trim.
997      * <p>
998      * These characters are trimmed off on each side of the delimiter
999      * until the token or quote is found.
1000      *
1001      * @param trimmer  the trimmer matcher to use, null ignored
1002      * @return this, to enable chaining
1003      */
1004     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1005         if (trimmer != null) {
1006             this.trimmerMatcher = trimmer;
1007         }
1008         return this;
1009     }
1010 
1011     //-----------------------------------------------------------------------
1012     /**
1013      * Gets whether the tokenizer currently returns empty tokens as null.
1014      * The default for this property is false.
1015      *
1016      * @return true if empty tokens are returned as null
1017      */
1018     public boolean isEmptyTokenAsNull() {
1019         return this.emptyAsNull;
1020     }
1021 
1022     /**
1023      * Sets whether the tokenizer should return empty tokens as null.
1024      * The default for this property is false.
1025      *
1026      * @param emptyAsNull  whether empty tokens are returned as null
1027      * @return this, to enable chaining
1028      */
1029     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1030         this.emptyAsNull = emptyAsNull;
1031         return this;
1032     }
1033 
1034     //-----------------------------------------------------------------------
1035     /**
1036      * Gets whether the tokenizer currently ignores empty tokens.
1037      * The default for this property is true.
1038      *
1039      * @return true if empty tokens are not returned
1040      */
1041     public boolean isIgnoreEmptyTokens() {
1042         return ignoreEmptyTokens;
1043     }
1044 
1045     /**
1046      * Sets whether the tokenizer should ignore and not return empty tokens.
1047      * The default for this property is true.
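     * <p>
     * For example (illustrative values only):
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a,,c", ',');
     * tok.getTokenList();                            // ["a", "c"] - empty token ignored by default
     * tok.reset("a,,c").setIgnoreEmptyTokens(false);
     * tok.getTokenList();                            // ["a", "", "c"]
     * }</pre>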
1048      *
1049      * @param ignoreEmptyTokens  whether empty tokens are not returned
1050      * @return this, to enable chaining
1051      */
1052     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1053         this.ignoreEmptyTokens = ignoreEmptyTokens;
1054         return this;
1055     }
1056 
1057     //-----------------------------------------------------------------------
1058     /**
1059      * Gets the String content that the tokenizer is parsing.
1060      *
1061      * @return the string content being parsed
1062      */
1063     public String getContent() {
1064         if (chars == null) {
1065             return null;
1066         }
1067         return new String(chars);
1068     }
1069 
1070     //-----------------------------------------------------------------------
1071     /**
1072      * Creates a new instance of this Tokenizer. The new instance is reset so
1073      * that it will be at the start of the token list.
1074      * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
1075      * 
1076      * @return a new instance of this Tokenizer which has been reset.
1077      */
1078     @Override
1079     public Object clone() {
1080         try {
1081             return cloneReset();
1082         } catch (final CloneNotSupportedException ex) {
1083             return null;
1084         }
1085     }
1086 
1087     /**
1088      * Creates a new instance of this Tokenizer. The new instance is reset so that
1089      * it will be at the start of the token list.
1090      * 
1091      * @return a new instance of this Tokenizer which has been reset.
1092      * @throws CloneNotSupportedException if there is a problem cloning
1093      */
1094     Object cloneReset() throws CloneNotSupportedException {
1095         // this method exists to enable 100% test coverage
1096         final StrTokenizer cloned = (StrTokenizer) super.clone();
1097         if (cloned.chars != null) {
1098             cloned.chars = cloned.chars.clone();
1099         }
1100         cloned.reset();
1101         return cloned;
1102     }
1103 
1104     //-----------------------------------------------------------------------
1105     /**
1106      * Gets a String representation of this tokenizer, listing the parsed tokens.
1107      *
1108      * @return the string representation
1109      */
1110     @Override
1111     public String toString() {
1112         if (tokens == null) {
1113             return "StrTokenizer[not tokenized yet]";
1114         }
1115         return "StrTokenizer" + getTokenList();
1116     }
1117 
1118 }