1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.text.matcher.StringMatcher;
26  import org.apache.commons.text.matcher.StringMatcherFactory;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators), supporting quoting and the concept of ignored characters.
30   * <p>
31   * This class can split a String into many smaller strings. It aims to do a similar job to
32   * {@link java.util.StringTokenizer StringTokenizer}; however, it offers much more control and flexibility,
33   * including implementing the <code>ListIterator</code> interface. By default, it is set up like <code>StringTokenizer</code>.
34   * <p>
35   * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
36   * <i>delimiter</i>. One or more delimiter characters must be specified.
37   * <p>
38   * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
39   * escaped within a quoted section by duplicating itself.
40   * <p>
41   * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
42   * specifies these characters. One usage might be to trim whitespace characters.
43   * <p>
44   * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
45   * these characters to be removed. One usage might be to remove new line characters.
46   * <p>
47   * Empty tokens may be removed or returned as null.
48   *
49   * <pre>
50   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
51   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
52   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
53   * </pre>
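 *
 * <p>
 * A minimal usage sketch (illustrative only; the input line is hypothetical):
 * <pre>
 * StringTokenizer tokenizer = StringTokenizer.getCSVInstance("a, \"b,c\", d");
 * while (tokenizer.hasNext()) {
 *     String token = tokenizer.next(); // "a", then "b,c", then "d"
 * }
 * </pre>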
54   *
55   * <table>
56   * <caption>StringTokenizer properties and options</caption>
57   * <tr>
58   * <th>Property</th>
59   * <th>Type</th>
60   * <th>Default</th>
61   * </tr>
62   * <tr>
63   * <td>delim</td>
64   * <td>CharSetMatcher</td>
65   * <td>{ \t\n\r\f}</td>
66   * </tr>
67   * <tr>
68   * <td>quote</td>
69   * <td>NoneMatcher</td>
70   * <td>{}</td>
71   * </tr>
72   * <tr>
73   * <td>ignore</td>
74   * <td>NoneMatcher</td>
75   * <td>{}</td>
76   * </tr>
77   * <tr>
78   * <td>emptyTokenAsNull</td>
79   * <td>boolean</td>
80   * <td>false</td>
81   * </tr>
82   * <tr>
83   * <td>ignoreEmptyTokens</td>
84   * <td>boolean</td>
85   * <td>true</td>
86   * </tr>
87   * </table>
88   *
89   * @since 1.3
90   */
91  public class StringTokenizer implements ListIterator<String>, Cloneable {
92  
93      /** Comma separated values tokenizer internal variable. */
94      private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
95      /** Tab separated values tokenizer internal variable. */
96      private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
97      static {
98          CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
99          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
100         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
101         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
102         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
103         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
104         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
105 
106         TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
107         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
108         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
109         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
110         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
111         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
112         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
113     }
114 
115     /** The text to work on. */
116     private char[] chars;
117     /** The parsed tokens. */
118     private String[] tokens;
119     /** The current iteration position. */
120     private int tokenPos;
121 
122     /** The delimiter matcher. */
123     private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
124     /** The quote matcher. */
125     private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
126     /** The ignored matcher. */
127     private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
128     /** The trimmer matcher. */
129     private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
130 
131     /** Whether to return empty tokens as null. */
132     private boolean emptyAsNull = false;
133     /** Whether to ignore empty tokens. */
134     private boolean ignoreEmptyTokens = true;
135 
136     // -----------------------------------------------------------------------
137 
138     /**
139      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
140      *
141      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
142      */
143     private static StringTokenizer getCSVClone() {
144         return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
145     }
146 
147     /**
148      * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing
149      * will be to trim whitespace from both ends (which can be overridden with the setTrimmerMatcher
150      * method).
151      * <p>
152      * You must call a "reset" method to set the string which you want to parse.
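     * <p>
     * A minimal sketch (the <code>line</code> variable is hypothetical):
     * <pre>
     * StringTokenizer tok = StringTokenizer.getCSVInstance();
     * tok.reset(line);
     * List&lt;String&gt; fields = tok.getTokenList();
     * </pre>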
153      *
154      * @return a new tokenizer instance which parses Comma Separated Value strings
155      */
156     public static StringTokenizer getCSVInstance() {
157         return getCSVClone();
158     }
159 
160     /**
161      * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given
162      * input. The default for CSV processing will be to trim whitespace from both ends (which can be overridden
163      * with the setTrimmerMatcher method).
164      *
165      * @param input
166      *            the text to parse
167      * @return a new tokenizer instance which parses Comma Separated Value strings
168      */
169     public static StringTokenizer getCSVInstance(final String input) {
170         final StringTokenizer tok = getCSVClone();
171         tok.reset(input);
172         return tok;
173     }
174 
175     /**
176      * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given
177      * input. The default for CSV processing will be to trim whitespace from both ends (which can be overridden
178      * with the setTrimmerMatcher method).
179      *
180      * @param input
181      *            the text to parse
182      * @return a new tokenizer instance which parses Comma Separated Value strings
183      */
184     public static StringTokenizer getCSVInstance(final char[] input) {
185         final StringTokenizer tok = getCSVClone();
186         tok.reset(input);
187         return tok;
188     }
189 
190     /**
191      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
192      *
193      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
194      */
195     private static StringTokenizer getTSVClone() {
196         return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
197     }
198 
199     /**
200      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
201      * to trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
202      * <p>
203      * You must call a "reset" method to set the string which you want to parse.
204      *
205      * @return a new tokenizer instance which parses Tab Separated Value strings.
206      */
207     public static StringTokenizer getTSVInstance() {
208         return getTSVClone();
209     }
210 
211     /**
212      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
213      * to trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
214      *
215      * @param input
216      *            the string to parse
217      * @return a new tokenizer instance which parses Tab Separated Value strings.
218      */
219     public static StringTokenizer getTSVInstance(final String input) {
220         final StringTokenizer tok = getTSVClone();
221         tok.reset(input);
222         return tok;
223     }
224 
225     /**
226      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
227      * to trim whitespace from both ends (which can be overridden with the setTrimmerMatcher method).
228      *
229      * @param input
230      *            the string to parse
231      * @return a new tokenizer instance which parses Tab Separated Value strings.
232      */
233     public static StringTokenizer getTSVInstance(final char[] input) {
234         final StringTokenizer tok = getTSVClone();
235         tok.reset(input);
236         return tok;
237     }
238 
239     // -----------------------------------------------------------------------
240     /**
241      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
242      * tokenize.
243      * <p>
244      * This constructor is normally used with {@link #reset(String)}.
245      */
246     public StringTokenizer() {
247         super();
248         this.chars = null;
249     }
250 
251     /**
252      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
253      *
254      * @param input
255      *            the string which is to be parsed
256      */
257     public StringTokenizer(final String input) {
258         super();
259         if (input != null) {
260             chars = input.toCharArray();
261         } else {
262             chars = null;
263         }
264     }
265 
266     /**
267      * Constructs a tokenizer splitting on the specified delimiter character.
268      *
269      * @param input
270      *            the string which is to be parsed
271      * @param delim
272      *            the field delimiter character
273      */
274     public StringTokenizer(final String input, final char delim) {
275         this(input);
276         setDelimiterChar(delim);
277     }
278 
279     /**
280      * Constructs a tokenizer splitting on the specified delimiter string.
281      *
282      * @param input
283      *            the string which is to be parsed
284      * @param delim
285      *            the field delimiter string
286      */
287     public StringTokenizer(final String input, final String delim) {
288         this(input);
289         setDelimiterString(delim);
290     }
291 
292     /**
293      * Constructs a tokenizer splitting using the specified delimiter matcher.
294      *
295      * @param input
296      *            the string which is to be parsed
297      * @param delim
298      *            the field delimiter matcher
299      */
300     public StringTokenizer(final String input, final StringMatcher delim) {
301         this(input);
302         setDelimiterMatcher(delim);
303     }
304 
305     /**
306      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
307      * quote character.
308      *
309      * @param input
310      *            the string which is to be parsed
311      * @param delim
312      *            the field delimiter character
313      * @param quote
314      *            the field quoted string character
315      */
316     public StringTokenizer(final String input, final char delim, final char quote) {
317         this(input, delim);
318         setQuoteChar(quote);
319     }
320 
321     /**
322      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
323      * quote matcher.
324      *
325      * @param input
326      *            the string which is to be parsed
327      * @param delim
328      *            the field delimiter matcher
329      * @param quote
330      *            the field quoted string matcher
331      */
332     public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
333         this(input, delim);
334         setQuoteMatcher(quote);
335     }
336 
337     /**
338      * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
339      *
340      * @param input
341      *            the character array which is to be parsed; the array is cloned
342      */
343     public StringTokenizer(final char[] input) {
344         super();
345         if (input == null) {
346             this.chars = null;
347         } else {
348             this.chars = input.clone();
349         }
350     }
351 
352     /**
353      * Constructs a tokenizer splitting on the specified character.
354      *
355      * @param input
356      *            the character array which is to be parsed; the array is cloned
357      * @param delim
358      *            the field delimiter character
359      */
360     public StringTokenizer(final char[] input, final char delim) {
361         this(input);
362         setDelimiterChar(delim);
363     }
364 
365     /**
366      * Constructs a tokenizer splitting on the specified string.
367      *
368      * @param input
369      *            the character array which is to be parsed; the array is cloned
370      * @param delim
371      *            the field delimiter string
372      */
373     public StringTokenizer(final char[] input, final String delim) {
374         this(input);
375         setDelimiterString(delim);
376     }
377 
378     /**
379      * Constructs a tokenizer splitting using the specified delimiter matcher.
380      *
381      * @param input
382      *            the character array which is to be parsed; the array is cloned
383      * @param delim
384      *            the field delimiter matcher
385      */
386     public StringTokenizer(final char[] input, final StringMatcher delim) {
387         this(input);
388         setDelimiterMatcher(delim);
389     }
390 
391     /**
392      * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
393      * quote character.
394      *
395      * @param input
396      *            the character array which is to be parsed; the array is cloned
397      * @param delim
398      *            the field delimiter character
399      * @param quote
400      *            the field quoted string character
401      */
402     public StringTokenizer(final char[] input, final char delim, final char quote) {
403         this(input, delim);
404         setQuoteChar(quote);
405     }
406 
407     /**
408      * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
409      * quote matcher.
410      *
411      * @param input
412      *            the character array which is to be parsed; the array is cloned
413      * @param delim
414      *            the field delimiter matcher
415      * @param quote
416      *            the field quoted string matcher
417      */
418     public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
419         this(input, delim);
420         setQuoteMatcher(quote);
421     }
422 
423     // API
424     // -----------------------------------------------------------------------
425     /**
426      * Gets the number of tokens found in the String.
427      *
428      * @return the number of matched tokens
429      */
430     public int size() {
431         checkTokenized();
432         return tokens.length;
433     }
434 
435     /**
436      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
437      * {@link NoSuchElementException} when no tokens remain.
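     * <p>
     * For example (a sketch; <code>tokenizer</code> is assumed to be already configured):
     * <pre>
     * String token;
     * while ((token = tokenizer.nextToken()) != null) {
     *     // process token
     * }
     * </pre>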
438      *
439      * @return the next sequential token, or null when no more tokens are found
440      */
441     public String nextToken() {
442         if (hasNext()) {
443             return tokens[tokenPos++];
444         }
445         return null;
446     }
447 
448     /**
449      * Gets the previous token from the String.
450      *
451      * @return the previous sequential token, or null when there are no previous tokens
452      */
453     public String previousToken() {
454         if (hasPrevious()) {
455             return tokens[--tokenPos];
456         }
457         return null;
458     }
459 
460     /**
461      * Gets a copy of the full token list as an independent modifiable array.
462      *
463      * @return the tokens as a String array
464      */
465     public String[] getTokenArray() {
466         checkTokenized();
467         return tokens.clone();
468     }
469 
470     /**
471      * Gets a copy of the full token list as an independent modifiable list.
472      *
473      * @return the tokens as a String list
474      */
475     public List<String> getTokenList() {
476         checkTokenized();
477         final List<String> list = new ArrayList<>(tokens.length);
478         Collections.addAll(list, tokens);
479 
480         return list;
481     }
482 
483     /**
484      * Resets this tokenizer, forgetting all parsing and iteration already completed.
485      * <p>
486      * This method allows the same tokenizer to be reused for the same String.
487      *
488      * @return this, to enable chaining
489      */
490     public StringTokenizer reset() {
491         tokenPos = 0;
492         tokens = null;
493         return this;
494     }
495 
496     /**
497      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
498      * same settings on multiple input lines.
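     * <p>
     * A minimal reuse sketch (the <code>lines</code> collection is hypothetical):
     * <pre>
     * StringTokenizer tok = new StringTokenizer("", ',', '"');
     * for (final String line : lines) {
     *     final List&lt;String&gt; fields = tok.reset(line).getTokenList();
     * }
     * </pre>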
499      *
500      * @param input
501      *            the new string to tokenize, null sets no text to parse
502      * @return this, to enable chaining
503      */
504     public StringTokenizer reset(final String input) {
505         reset();
506         if (input != null) {
507             this.chars = input.toCharArray();
508         } else {
509             this.chars = null;
510         }
511         return this;
512     }
513 
514     /**
515      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
516      * same settings on multiple input lines.
517      *
518      * @param input
519      *            the new character array to tokenize; the array is cloned, null sets no text to parse
520      * @return this, to enable chaining
521      */
522     public StringTokenizer reset(final char[] input) {
523         reset();
524         if (input != null) {
525             this.chars = input.clone();
526         } else {
527             this.chars = null;
528         }
529         return this;
530     }
531 
532     // ListIterator
533     // -----------------------------------------------------------------------
534     /**
535      * Checks whether there are any more tokens.
536      *
537      * @return true if there are more tokens
538      */
539     @Override
540     public boolean hasNext() {
541         checkTokenized();
542         return tokenPos < tokens.length;
543     }
544 
545     /**
546      * Gets the next token.
547      *
548      * @return the next String token
549      * @throws NoSuchElementException
550      *             if there are no more elements
551      */
552     @Override
553     public String next() {
554         if (hasNext()) {
555             return tokens[tokenPos++];
556         }
557         throw new NoSuchElementException();
558     }
559 
560     /**
561      * Gets the index of the next token to return.
562      *
563      * @return the next token index
564      */
565     @Override
566     public int nextIndex() {
567         return tokenPos;
568     }
569 
570     /**
571      * Checks whether there are any previous tokens that can be iterated to.
572      *
573      * @return true if there are previous tokens
574      */
575     @Override
576     public boolean hasPrevious() {
577         checkTokenized();
578         return tokenPos > 0;
579     }
580 
581     /**
582      * Gets the token previous to the last returned token.
583      *
584      * @return the previous token
     * @throws NoSuchElementException
     *             if there are no previous tokens
585      */
586     @Override
587     public String previous() {
588         if (hasPrevious()) {
589             return tokens[--tokenPos];
590         }
591         throw new NoSuchElementException();
592     }
593 
594     /**
595      * Gets the index of the previous token.
596      *
597      * @return the previous token index
598      */
599     @Override
600     public int previousIndex() {
601         return tokenPos - 1;
602     }
603 
604     /**
605      * Unsupported ListIterator operation.
606      *
607      * @throws UnsupportedOperationException
608      *             always
609      */
610     @Override
611     public void remove() {
612         throw new UnsupportedOperationException("remove() is unsupported");
613     }
614 
615     /**
616      * Unsupported ListIterator operation.
617      *
618      * @param obj
619      *            this parameter is ignored.
620      * @throws UnsupportedOperationException
621      *             always
622      */
623     @Override
624     public void set(final String obj) {
625         throw new UnsupportedOperationException("set() is unsupported");
626     }
627 
628     /**
629      * Unsupported ListIterator operation.
630      *
631      * @param obj
632      *            this parameter is ignored.
633      * @throws UnsupportedOperationException
634      *             always
635      */
636     @Override
637     public void add(final String obj) {
638         throw new UnsupportedOperationException("add() is unsupported");
639     }
640 
641     // Implementation
642     // -----------------------------------------------------------------------
643     /**
644      * Checks if tokenization has been done, and if not, performs it.
645      */
646     private void checkTokenized() {
647         if (tokens == null) {
648             if (chars == null) {
649                 // still call tokenize as subclass may do some work
650                 final List<String> split = tokenize(null, 0, 0);
651                 tokens = split.toArray(new String[split.size()]);
652             } else {
653                 final List<String> split = tokenize(chars, 0, chars.length);
654                 tokens = split.toArray(new String[split.size()]);
655             }
656         }
657     }
658 
659     /**
660      * Internal method that performs the tokenization.
661      * <p>
662      * Most users of this class do not need to call this method. This method will be called automatically by other
663      * (public) methods when required.
664      * <p>
665      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
666      * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
667      * strings. It is also possible to filter the results, as sketched below.
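     * <p>
     * For instance, a subclass might drop blank tokens after the standard parse (an illustrative sketch, not
     * part of this class):
     * <pre>
     * protected List&lt;String&gt; tokenize(final char[] srcChars, final int offset, final int count) {
     *     final List&lt;String&gt; tokens = super.tokenize(srcChars, offset, count);
     *     // filter out null or empty tokens before returning
     *     tokens.removeIf(token -&gt; token == null || token.isEmpty());
     *     return tokens;
     * }
     * </pre>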
668      * <p>
669      * <code>StringTokenizer</code> will always pass a zero offset and a count equal to the length of the array to this
670      * method; however, a subclass may pass other values, or even an entirely different array.
671      *
672      * @param srcChars
673      *            the character array being tokenized, may be null
674      * @param offset
675      *            the start position within the character array, must be valid
676      * @param count
677      *            the number of characters to tokenize, must be valid
678      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
679      */
680     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
681         if (srcChars == null || count == 0) {
682             return Collections.emptyList();
683         }
684         final TextStringBuilder buf = new TextStringBuilder();
685         final List<String> tokenList = new ArrayList<>();
686         int pos = offset;
687 
688         // loop around the entire buffer
689         while (pos >= 0 && pos < count) {
690             // find next token
691             pos = readNextToken(srcChars, pos, count, buf, tokenList);
692 
693             // handle case where end of string is a delimiter
694             if (pos >= count) {
695                 addToken(tokenList, "");
696             }
697         }
698         return tokenList;
699     }
700 
701     /**
702      * Adds a token to a list, applying the configured empty-token settings.
703      *
704      * @param list
705      *            the list to add to
706      * @param tok
707      *            the token to add
708      */
709     private void addToken(final List<String> list, String tok) {
710         if (tok == null || tok.length() == 0) {
711             if (isIgnoreEmptyTokens()) {
712                 return;
713             }
714             if (isEmptyTokenAsNull()) {
715                 tok = null;
716             }
717         }
718         list.add(tok);
719     }
720 
721     /**
722      * Reads character by character through the String to get the next token.
723      *
724      * @param srcChars
725      *            the character array being tokenized
726      * @param start
727      *            the first character of the field
728      * @param len
729      *            the length of the character array being tokenized
730      * @param workArea
731      *            a temporary work area
732      * @param tokenList
733      *            the list of parsed tokens
734      * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end of
735      *         string found
736      */
737     private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
738             final List<String> tokenList) {
739         // skip all leading whitespace, unless it is the
740         // field delimiter or the quote character
741         while (start < len) {
742             final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
743                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
744             if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
745                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
746                 break;
747             }
748             start += removeLen;
749         }
750 
751         // handle reaching end
752         if (start >= len) {
753             addToken(tokenList, "");
754             return -1;
755         }
756 
757         // handle empty token
758         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
759         if (delimLen > 0) {
760             addToken(tokenList, "");
761             return start + delimLen;
762         }
763 
764         // handle found token
765         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
766         if (quoteLen > 0) {
767             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
768         }
769         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
770     }
771 
772     /**
773      * Reads a possibly quoted string token.
774      *
775      * @param srcChars
776      *            the character array being tokenized
777      * @param start
778      *            the first character of the field
779      * @param len
780      *            the length of the character array being tokenized
781      * @param workArea
782      *            a temporary work area
783      * @param tokenList
784      *            the list of parsed tokens
785      * @param quoteStart
786      *            the start position of the matched quote, 0 if no quoting
787      * @param quoteLen
788      *            the length of the matched quote, 0 if no quoting
789      * @return the starting position of the next field (the character immediately after the delimiter), or -1 if the
790      *         end of the string is found
791      */
792     private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
793             final List<String> tokenList, final int quoteStart, final int quoteLen) {
794         // Loop until we've found the end of the quoted
795         // string or the end of the input
796         workArea.clear();
797         int pos = start;
798         boolean quoting = quoteLen > 0;
799         int trimStart = 0;
800 
801         while (pos < len) {
802             // quoting mode can occur several times throughout a string
803             // we must switch between quoting and non-quoting until we
804             // encounter a non-quoted delimiter, or end of string
805             if (quoting) {
806                 // In quoting mode
807 
808                 // If we've found a quote character, see if it's
809                 // followed by a second quote. If so, then we need
810                 // to actually put the quote character into the token
811                 // rather than end the token.
812                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
813                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
814                         // matched pair of quotes, thus an escaped quote
815                         workArea.append(srcChars, pos, quoteLen);
816                         pos += quoteLen * 2;
817                         trimStart = workArea.size();
818                         continue;
819                     }
820 
821                     // end of quoting
822                     quoting = false;
823                     pos += quoteLen;
824                     continue;
825                 }
826 
827                 // copy regular character from inside quotes
828                 workArea.append(srcChars[pos++]);
829                 trimStart = workArea.size();
830 
831             } else {
832                 // Not in quoting mode
833 
834                 // check for delimiter, and thus end of token
835                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
836                 if (delimLen > 0) {
837                     // return condition when end of token found
838                     addToken(tokenList, workArea.substring(0, trimStart));
839                     return pos + delimLen;
840                 }
841 
842                 // check for quote, and thus back into quoting mode
843                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
844                     quoting = true;
845                     pos += quoteLen;
846                     continue;
847                 }
848 
849                 // check for ignored (outside quotes), and ignore
850                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
851                 if (ignoredLen > 0) {
852                     pos += ignoredLen;
853                     continue;
854                 }
855 
856                 // check for trimmed character
857                 // don't yet know if it's at the end, so copy to workArea
858                 // use trimStart to keep track of trim at the end
859                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
860                 if (trimmedLen > 0) {
861                     workArea.append(srcChars, pos, trimmedLen);
862                     pos += trimmedLen;
863                     continue;
864                 }
865 
866                 // copy regular character from outside quotes
867                 workArea.append(srcChars[pos++]);
868                 trimStart = workArea.size();
869             }
870         }
871 
872         // return condition when end of string found
873         addToken(tokenList, workArea.substring(0, trimStart));
874         return -1;
875     }
876 
877     /**
878      * Checks if the characters at the index specified match the quote already matched in readNextToken().
879      *
880      * @param srcChars
881      *            the character array being tokenized
882      * @param pos
883      *            the position to check for a quote
884      * @param len
885      *            the length of the character array being tokenized
886      * @param quoteStart
887      *            the start position of the matched quote, 0 if no quoting
888      * @param quoteLen
889      *            the length of the matched quote, 0 if no quoting
890      * @return true if a quote is matched
891      */
892     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
893             final int quoteLen) {
894         for (int i = 0; i < quoteLen; i++) {
895             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
896                 return false;
897             }
898         }
899         return true;
900     }
901 
902     // Delimiter
903     // -----------------------------------------------------------------------
904     /**
905      * Gets the field delimiter matcher.
906      *
907      * @return the delimiter matcher in use
908      */
909     public StringMatcher getDelimiterMatcher() {
910         return this.delimMatcher;
911     }
912 
913     /**
914      * Sets the field delimiter matcher.
915      * <p>
916      * The delimiter is used to separate one token from another.
917      *
918      * @param delim
919      *            the delimiter matcher to use
920      * @return this, to enable chaining
921      */
922     public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
923         if (delim == null) {
924             this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
925         } else {
926             this.delimMatcher = delim;
927         }
928         return this;
929     }
930 
931     /**
932      * Sets the field delimiter character.
933      *
934      * @param delim
935      *            the delimiter character to use
936      * @return this, to enable chaining
937      */
938     public StringTokenizer setDelimiterChar(final char delim) {
939         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
940     }
941 
942     /**
943      * Sets the field delimiter string.
944      *
945      * @param delim
946      *            the delimiter string to use
947      * @return this, to enable chaining
948      */
949     public StringTokenizer setDelimiterString(final String delim) {
950         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
951     }
952 
953     // Quote
954     // -----------------------------------------------------------------------
955     /**
956      * Gets the quote matcher currently in use.
957      * <p>
958      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
959      * default value is '"' (double quote).
960      *
961      * @return the quote matcher in use
962      */
963     public StringMatcher getQuoteMatcher() {
964         return quoteMatcher;
965     }
966 
967     /**
968      * Set the quote matcher to use.
969      * <p>
970      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
971      *
972      * @param quote
973      *            the quote matcher to use, null ignored
974      * @return this, to enable chaining
975      */
976     public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
977         if (quote != null) {
978             this.quoteMatcher = quote;
979         }
980         return this;
981     }
982 
983     /**
984      * Sets the quote character to use.
985      * <p>
986      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
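     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * StringTokenizer tok = new StringTokenizer("a,'b,c',d", ',');
     * tok.setQuoteChar('\'');
     * // tok.getTokenList() yields "a", "b,c", "d"
     * </pre>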
987      *
988      * @param quote
989      *            the quote character to use
990      * @return this, to enable chaining
991      */
992     public StringTokenizer setQuoteChar(final char quote) {
993         return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
994     }
995 
996     // Ignored
997     // -----------------------------------------------------------------------
998     /**
999      * Gets the ignored character matcher.
1000      * <p>
1001      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
1002      * is not to ignore anything.
1003      *
1004      * @return the ignored matcher in use
1005      */
1006     public StringMatcher getIgnoredMatcher() {
1007         return ignoredMatcher;
1008     }
1009 
1010     /**
1011      * Set the matcher for characters to ignore.
1012      * <p>
1013      * These characters are ignored when parsing the String, unless they are within a quoted region.
1014      *
1015      * @param ignored
1016      *            the ignored matcher to use, null ignored
1017      * @return this, to enable chaining
1018      */
1019     public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1020         if (ignored != null) {
1021             this.ignoredMatcher = ignored;
1022         }
1023         return this;
1024     }
1025 
1026     /**
1027      * Set the character to ignore.
1028      * <p>
1029      * This character is ignored when parsing the String, unless it is within a quoted region.
1030      *
1031      * @param ignored
1032      *            the ignored character to use
1033      * @return this, to enable chaining
1034      */
1035     public StringTokenizer setIgnoredChar(final char ignored) {
1036         return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
1037     }
1038 
1039     // Trimmer
1040     // -----------------------------------------------------------------------
1041     /**
1042      * Gets the trimmer character matcher.
1043      * <p>
1044      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
1045      * value is not to trim anything.
1046      *
1047      * @return the trimmer matcher in use
1048      */
1049     public StringMatcher getTrimmerMatcher() {
1050         return trimmerMatcher;
1051     }
1052 
1053     /**
1054      * Sets the matcher for characters to trim.
1055      * <p>
1056      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
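     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * StringTokenizer tok = new StringTokenizer("a ; b ; c", ';');
     * tok.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
     * // tok.getTokenList() yields "a", "b", "c" rather than "a ", " b ", " c"
     * </pre>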
1057      *
1058      * @param trimmer
1059      *            the trimmer matcher to use, null ignored
1060      * @return this, to enable chaining
1061      */
1062     public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1063         if (trimmer != null) {
1064             this.trimmerMatcher = trimmer;
1065         }
1066         return this;
1067     }
1068 
1069     // -----------------------------------------------------------------------
1070     /**
1071      * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
1072      *
1073      * @return true if empty tokens are returned as null
1074      */
1075     public boolean isEmptyTokenAsNull() {
1076         return this.emptyAsNull;
1077     }
1078 
1079     /**
1080      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
1081      *
1082      * @param emptyAsNull
1083      *            whether empty tokens are returned as null
1084      * @return this, to enable chaining
1085      */
1086     public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1087         this.emptyAsNull = emptyAsNull;
1088         return this;
1089     }
1090 
1091     // -----------------------------------------------------------------------
1092     /**
1093      * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
1094      *
1095      * @return true if empty tokens are not returned
1096      */
1097     public boolean isIgnoreEmptyTokens() {
1098         return ignoreEmptyTokens;
1099     }
1100 
1101     /**
1102      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
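     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * StringTokenizer tok = new StringTokenizer("a,,b", ',');
     * tok.setIgnoreEmptyTokens(false);
     * // tok.getTokenList() yields "a", "", "b" instead of "a", "b"
     * </pre>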
1103      *
1104      * @param ignoreEmptyTokens
1105      *            whether empty tokens are not returned
1106      * @return this, to enable chaining
1107      */
1108     public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1109         this.ignoreEmptyTokens = ignoreEmptyTokens;
1110         return this;
1111     }
1112 
1113     // -----------------------------------------------------------------------
1114     /**
1115      * Gets the String content that the tokenizer is parsing.
1116      *
1117      * @return the string content being parsed
1118      */
1119     public String getContent() {
1120         if (chars == null) {
1121             return null;
1122         }
1123         return new String(chars);
1124     }
1125 
1126     // -----------------------------------------------------------------------
1127     /**
1128      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1129      * list. If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
1130      *
1131      * @return a new instance of this Tokenizer which has been reset.
1132      */
1133     @Override
1134     public Object clone() {
1135         try {
1136             return cloneReset();
1137         } catch (final CloneNotSupportedException ex) {
1138             return null;
1139         }
1140     }
1141 
1142     /**
1143      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1144      * list.
1145      *
1146      * @return a new instance of this Tokenizer which has been reset.
1147      * @throws CloneNotSupportedException
1148      *             if there is a problem cloning
1149      */
1150     Object cloneReset() throws CloneNotSupportedException {
1151         // this method exists to enable 100% test coverage
1152         final StringTokenizer cloned = (StringTokenizer) super.clone();
1153         if (cloned.chars != null) {
1154             cloned.chars = cloned.chars.clone();
1155         }
1156         cloned.reset();
1157         return cloned;
1158     }
1159 
1160     // -----------------------------------------------------------------------
1161     /**
1162      * Gets a String representation of this tokenizer, including the list of parsed tokens if tokenization has occurred.
1163      *
1164      * @return the string representation of this tokenizer
1165      */
1166     @Override
1167     public String toString() {
1168         if (tokens == null) {
1169             return "StringTokenizer[not tokenized yet]";
1170         }
1171         return "StringTokenizer" + getTokenList();
1172     }
1173 
1174 }