View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  import java.util.StringTokenizer;
26  
27  import org.apache.commons.lang3.ArrayUtils;
28  import org.apache.commons.lang3.StringUtils;
29  
30  /**
31   * Tokenizes a string based on delimiters (separators),
32   * with support for quoting and ignored characters.
33   * <p>
34   * This class can split a String into many smaller strings. It aims
35   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36   * however it offers much more control and flexibility including implementing
37   * the {@link ListIterator} interface. By default, it is set up
38   * like {@link StringTokenizer}.
39   * </p>
40   * <p>
41   * The input String is split into a number of <i>tokens</i>.
42   * Each token is separated from the next String by a <i>delimiter</i>.
43   * One or more delimiter characters must be specified.
44   * </p>
45   * <p>
46   * Each token may be surrounded by quotes.
47   * The <i>quote</i> matcher specifies the quote character(s).
48   * A quote may be escaped within a quoted section by duplicating itself.
49   * </p>
50   * <p>
51   * Between each token and the delimiter are potentially characters that need trimming.
52   * The <i>trimmer</i> matcher specifies these characters.
53   * One usage might be to trim whitespace characters.
54   * </p>
55   * <p>
56   * At any point outside the quotes there might potentially be invalid characters.
57   * The <i>ignored</i> matcher specifies these characters to be removed.
58   * One usage might be to remove new line characters.
59   * </p>
60   * <p>
61   * Empty tokens may be removed or returned as null.
62   * </p>
63   * <pre>
64   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
65   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
66   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67   * </pre>
68   *
69   * <table>
70   *  <caption>StrTokenizer properties and options</caption>
71   *  <tr>
72   *   <th>Property</th><th>Type</th><th>Default</th>
73   *  </tr>
74   *  <tr>
75   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76   *  </tr>
77   *  <tr>
78   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
79   *  </tr>
80   *  <tr>
81   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82   *  </tr>
83   *  <tr>
84   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85   *  </tr>
86   *  <tr>
87   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88   *  </tr>
89   * </table>
90   *
91   * @since 2.2
92   * @deprecated As of 3.6, use Apache Commons Text
93   * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94   * StringTokenizer</a> instead
95   */
96  @Deprecated
97  public class StrTokenizer implements ListIterator<String>, Cloneable {
98  
99      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
100     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
101     static {
102         final StrTokenizer csvTemplate = new StrTokenizer();
103         csvTemplate.setDelimiterMatcher(StrMatcher.commaMatcher());
104         csvTemplate.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
105         csvTemplate.setIgnoredMatcher(StrMatcher.noneMatcher());
106         csvTemplate.setTrimmerMatcher(StrMatcher.trimMatcher());
107         csvTemplate.setEmptyTokenAsNull(false);
108         csvTemplate.setIgnoreEmptyTokens(false);
109         CSV_TOKENIZER_PROTOTYPE = csvTemplate;
110         final StrTokenizer tsvTemplate = new StrTokenizer();
111         tsvTemplate.setDelimiterMatcher(StrMatcher.tabMatcher());
112         tsvTemplate.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
113         tsvTemplate.setIgnoredMatcher(StrMatcher.noneMatcher());
114         tsvTemplate.setTrimmerMatcher(StrMatcher.trimMatcher());
115         tsvTemplate.setEmptyTokenAsNull(false);
116         tsvTemplate.setIgnoreEmptyTokens(false);
117         TSV_TOKENIZER_PROTOTYPE = tsvTemplate;
118     }
119     /**
120      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121      *
122      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123      */
124     private static StrTokenizer getCSVClone() {
125         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126     }
127     /**
128      * Gets a new tokenizer instance which parses Comma Separated Value strings.
129      * No input is set on the returned tokenizer. The CSV defaults trim
130      * whitespace from both ends of each token (which can be overridden with
131      * the setTrimmerMatcher method).
132      * <p>
133      * You must call a "reset" method to set the string which you want to parse.
134      * </p>
135      * @return a new tokenizer instance which parses Comma Separated Value strings
136      */
137     public static StrTokenizer getCSVInstance() {
138         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
139     }
140     /**
141      * Gets a new tokenizer instance which parses Comma Separated Value strings,
142      * initializing it with the given input. The CSV defaults trim whitespace
143      * from both ends of each token (this can be overridden with the
144      * setTrimmerMatcher method).
145      *
146      * @param input  the text to parse, copied by the tokenizer; null means
147      *  there is no text to parse
148      * @return a new tokenizer instance which parses Comma Separated Value strings
149      */
150     public static StrTokenizer getCSVInstance(final char[] input) {
151         // reset(char[]) returns the tokenizer it was called on
152         return getCSVClone().reset(input);
153     }
154 
155     /**
156      * Gets a new tokenizer instance which parses Comma Separated Value strings,
157      * initializing it with the given input. The CSV defaults trim whitespace
158      * from both ends of each token (this can be overridden with the
159      * setTrimmerMatcher method).
160      *
161      * @param input  the text to parse, null means there is no text to parse
162      * @return a new tokenizer instance which parses Comma Separated Value strings
163      */
164     public static StrTokenizer getCSVInstance(final String input) {
165         final StrTokenizer csvTokenizer = getCSVClone();
166         // reset(String) hands back the tokenizer it was called on
167         return csvTokenizer.reset(input);
168     }
169     /**
170      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171      *
172      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173      */
174     private static StrTokenizer getTSVClone() {
175         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176     }
177     /**
178      * Gets a new tokenizer instance which parses Tab Separated Value strings.
179      * No input is set. The TSV defaults trim whitespace from both ends of each
180      * token (which can be overridden with the setTrimmerMatcher method).
181      * <p>
182      * You must call a "reset" method to set the string which you want to parse.
183      * </p>
184      * @return a new tokenizer instance which parses Tab Separated Value strings.
185      */
186     public static StrTokenizer getTSVInstance() {
187         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
188     }
189     /**
190      * Gets a new tokenizer instance which parses Tab Separated Value strings,
191      * initializing it with the given input. The TSV defaults trim whitespace
192      * from both ends of each token (overridable via setTrimmerMatcher).
193      *
194      * @param input  the characters to parse, copied by the tokenizer
195      * @return a new tokenizer instance which parses Tab Separated Value strings.
196      */
197     public static StrTokenizer getTSVInstance(final char[] input) {
198         // reset(char[]) returns the tokenizer it was called on
199         return getTSVClone().reset(input);
200     }
201 
202     /**
203      * Gets a new tokenizer instance which parses Tab Separated Value strings,
204      * initializing it with the given input. The TSV defaults trim whitespace
205      * from both ends of each token (overridable via setTrimmerMatcher).
206      *
207      * @param input  the string to parse, null means there is no text to parse
208      * @return a new tokenizer instance which parses Tab Separated Value strings.
209      */
210     public static StrTokenizer getTSVInstance(final String input) {
211         // reset(String) returns the tokenizer it was called on
212         return getTSVClone().reset(input);
213     }
214     /** The text to work on, or null when there is nothing to parse. */
215     private char[] chars;
216 
217 
218     /** The parsed tokens, or null when tokenization has not been done yet. */
219     private String[] tokens;
220 
221     /** The current iteration position, i.e. the index of the next token to return. */
222     private int tokenPos;
223 
224     /** The delimiter matcher, initially the split matcher. */
225     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
226 
227     /** The quote matcher, initially the none matcher (no quoting). */
228     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
229 
230     /** The ignored matcher, initially the none matcher (nothing ignored). */
231     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
232 
233 
234     /** The trimmer matcher, initially the none matcher (nothing trimmed). */
235     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
236 
237     /** Whether to return empty tokens as null, false unless changed. */
238     private boolean emptyAsNull;
239 
240     /** Whether to ignore empty tokens, true unless changed. */
241     private boolean ignoreEmptyTokens = true;
242 
243     /**
244      * Constructs a tokenizer with the default settings (delimiters as per
245      * StringTokenizer) and no text to tokenize.
246      * <p>
247      * Supply the text later, typically through {@link #reset(String)}.
248      * </p>
249      */
250     public StrTokenizer() {
251         chars = null;
252     }
253 
254     /**
255      * Constructs a tokenizer splitting on space, tab, newline and formfeed
256      * as per StringTokenizer.
257      *
258      * @param input  the characters which are to be parsed; a copy is stored
259      */
260     public StrTokenizer(final char[] input) {
261         chars = ArrayUtils.clone(input);
262     }
263 
264     /**
265      * Constructs a tokenizer splitting on the specified character.
266      *
267      * @param input  the characters which are to be parsed; a copy is stored
268      * @param delim the field delimiter character
269      */
270     public StrTokenizer(final char[] input, final char delim) {
271         this(input);
272         setDelimiterChar(delim);
273     }
274 
275     /**
276      * Constructs a tokenizer that splits on the given delimiter character and
277      * handles quotes using the given quote character.
278      * @param input  the characters which are to be parsed; a copy is stored
279      * @param delim  the field delimiter character
280      * @param quote  the field quoted string character
281      */
282     public StrTokenizer(final char[] input, final char delim, final char quote) {
283         this(input);
284         setDelimiterChar(delim);
285         setQuoteChar(quote);
286     }
287 
288     /**
289      * Constructs a tokenizer splitting on the specified string.
290      *
291      * @param input  the characters which are to be parsed; a copy is stored
292      * @param delim the field delimiter string
293      */
294     public StrTokenizer(final char[] input, final String delim) {
295         this(input);
296         setDelimiterString(delim);
297     }
298 
299     /**
300      * Constructs a tokenizer splitting using the specified delimiter matcher.
301      *
302      * @param input  the characters which are to be parsed; a copy is stored
303      * @param delim  the field delimiter matcher
304      */
305     public StrTokenizer(final char[] input, final StrMatcher delim) {
306         this(input);
307         setDelimiterMatcher(delim);
308     }
309 
310     /**
311      * Constructs a tokenizer splitting using the specified delimiter matcher
312      * and handling quotes using the specified quote matcher.
313      * @param input  the characters which are to be parsed; a copy is stored
314      * @param delim  the field delimiter matcher
315      * @param quote  the field quoted string matcher
316      */
317     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
318         this(input);
319         setDelimiterMatcher(delim);
320         setQuoteMatcher(quote);
321     }
322 
323     /**
324      * Constructs a tokenizer splitting on space, tab, newline and formfeed
325      * as per StringTokenizer.
326      * <p>
327      * The characters of the string are copied into an internal array.
328      * </p>
329      *
330      * @param input  the string which is to be parsed, null means there is
331      *  no text to parse
332      */
333     public StrTokenizer(final String input) {
334         chars = input == null ? null : input.toCharArray();
335     }
336 
337     /**
338      * Constructs a tokenizer splitting on the specified delimiter character.
339      *
340      * @param input  the string which is to be parsed, null means no text to parse
341      * @param delim  the field delimiter character
342      */
343     public StrTokenizer(final String input, final char delim) {
344         this(input);
345         setDelimiterChar(delim);
346     }
347 
348     /**
349      * Constructs a tokenizer that splits on the given delimiter character and
350      * handles quotes using the given quote character.
351      * @param input  the string which is to be parsed, null means no text to parse
352      * @param delim  the field delimiter character
353      * @param quote  the field quoted string character
354      */
355     public StrTokenizer(final String input, final char delim, final char quote) {
356         this(input);
357         setDelimiterChar(delim);
358         setQuoteChar(quote);
359     }
360 
361     /**
362      * Constructs a tokenizer splitting on the specified delimiter string.
363      *
364      * @param input  the string which is to be parsed, null means no text to parse
365      * @param delim  the field delimiter string
366      */
367     public StrTokenizer(final String input, final String delim) {
368         this(input);
369         setDelimiterString(delim);
370     }
371 
372     /**
373      * Constructs a tokenizer splitting using the specified delimiter matcher.
374      *
375      * @param input  the string which is to be parsed, null means no text to parse
376      * @param delim  the field delimiter matcher
377      */
378     public StrTokenizer(final String input, final StrMatcher delim) {
379         this(input);
380         setDelimiterMatcher(delim);
381     }
382 
383     /**
384      * Constructs a tokenizer splitting using the specified delimiter matcher
385      * and handling quotes using the specified quote matcher.
386      * @param input  the string which is to be parsed, null means no text to parse
387      * @param delim  the field delimiter matcher
388      * @param quote  the field quoted string matcher
389      */
390     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
391         this(input);
392         setDelimiterMatcher(delim);
393         setQuoteMatcher(quote);
394     }
395 
396     /**
397      * Unsupported ListIterator operation; this method never adds a token.
398      * @param obj this parameter ignored.
399      * @throws UnsupportedOperationException always
400      */
401     @Override
402     public void add(final String obj) {
403         throw new UnsupportedOperationException("add() is unsupported");
404     }
405 
406     /**
407      * Adds a token to a list, paying attention to the parameters we've set.
408      *
409      * @param list  the list to add to
410      * @param tok  the token to add
411      */
412     private void addToken(final List<String> list, String tok) {
413         if (StringUtils.isEmpty(tok)) {
414             if (isIgnoreEmptyTokens()) {
415                 return;
416             }
417             if (isEmptyTokenAsNull()) {
418                 tok = null;
419             }
420         }
421         list.add(tok);
422     }
423 
424     /**
425      * Tokenizes the input on first use and stores the resulting tokens.
426      * <p>
427      * Does nothing when the tokens have already been computed.
428      * </p>
429      */
430     private void checkTokenized() {
431         if (tokens != null) {
432             return;
433         }
434         // a null input is still handed to tokenize as a subclass may do some work
435         final int length = chars == null ? 0 : chars.length;
436         final List<String> split = tokenize(chars, 0, length);
437         tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
438     }
439 
440     /**
441      * Creates a new instance of this Tokenizer, reset to the start of the
442      * token list. Returns {@code null} if a {@link CloneNotSupportedException} is caught.
443      * @return a new instance of this Tokenizer which has been reset.
444      */
445     @Override
446     public Object clone() {
447         Object copy;
448         try {
449             copy = cloneReset();
450         } catch (final CloneNotSupportedException ex) {
451             copy = null;
452         }
453         return copy;
454     }
455 
456     /**
457      * Creates a new instance of this Tokenizer with its own copy of the input
458      * characters. The new instance is reset so that it will be at the start
459      * of the token list.
460      *
461      * @return a new instance of this Tokenizer which has been reset.
462      * @throws CloneNotSupportedException if there is a problem cloning
463      */
464     Object cloneReset() throws CloneNotSupportedException {
465         // kept separate from clone() purely to enable 100% test coverage
466         final StrTokenizer copy = (StrTokenizer) super.clone();
467         // give the copy its own character array rather than sharing this one
468         copy.chars = copy.chars == null ? null : copy.chars.clone();
469         copy.reset();
470         return copy;
471     }
472 
473     /**
474      * Gets the String content that the tokenizer is parsing.
475      * <p>
476      * A new String is built from the internal characters on every call.
477      * </p>
478      *
479      * @return the string content being parsed, null if no text has been set
480      */
481     public String getContent() {
482         return chars == null ? null : new String(chars);
483     }
484 
485     /**
486      * Gets the matcher used to find field delimiters.
487      *
488      * @return the delimiter matcher in use
489      */
490     public StrMatcher getDelimiterMatcher() {
491         return delimMatcher;
492     }
493 
494     // Ignored
495     /**
496      * Gets the ignored character matcher.
497      * <p>
498      * Characters matched by it are skipped while parsing, except inside a
499      * quoted region, where they are kept.
500      * By default nothing is ignored.
501      * </p>
502      *
503      * @return the ignored matcher in use
504      */
505     public StrMatcher getIgnoredMatcher() {
506         return this.ignoredMatcher;
507     }
508 
509     /**
510      * Gets the quote matcher currently in use.
511      * <p>
512      * The quote character is used to wrap data between the tokens.
513      * This enables delimiters to be entered as data.
514      * By default no quote matching is performed (the none matcher is used).
515      * </p>
516      *
517      * @return the quote matcher in use
518      */
519     public StrMatcher getQuoteMatcher() {
520         return this.quoteMatcher;
521     }
522 
523     /**
524      * Gets all tokens as a new array; changing it does not affect this tokenizer.
525      *
526      * @return the tokens as a String array
527      */
528     public String[] getTokenArray() {
529         checkTokenized();
530         return Arrays.copyOf(tokens, tokens.length);
531     }
532 
533     /**
534      * Gets all tokens as a new modifiable list, independent of this tokenizer.
535      *
536      * @return the tokens as a String list
537      */
538     public List<String> getTokenList() {
539         checkTokenized();
540         final List<String> copy = new ArrayList<>(tokens.length);
541         Collections.addAll(copy, tokens);
542         return copy;
543     }
544 
545     /**
546      * Gets the trimmer character matcher.
547      * <p>
548      * Matched characters are trimmed off on each side of the delimiter,
549      * up to the token or quote.
550      * By default nothing is trimmed.
551      * </p>
552      *
553      * @return the trimmer matcher in use
554      */
555     public StrMatcher getTrimmerMatcher() {
556         return this.trimmerMatcher;
557     }
558 
559     /**
560      * Checks whether there are any more tokens, tokenizing first if needed.
561      *
562      * @return true if there are more tokens
563      */
564     @Override
565     public boolean hasNext() {
566         checkTokenized();
567         return tokens.length > tokenPos;
568     }
569 
570     /**
571      * Checks whether an earlier token exists, tokenizing first if needed.
572      *
573      * @return true if there are previous tokens
574      */
575     @Override
576     public boolean hasPrevious() {
577         checkTokenized();
578         return tokenPos >= 1;
579     }
580 
581     /**
582      * Reports whether empty tokens are currently returned as null.
583      * The default for this property is false.
584      *
585      * @return true if empty tokens are returned as null
586      */
587     public boolean isEmptyTokenAsNull() {
588         return emptyAsNull;
589     }
590 
591     /**
592      * Reports whether empty tokens are currently left out of the results.
593      * The default for this property is true.
594      *
595      * @return true if empty tokens are not returned
596      */
597     public boolean isIgnoreEmptyTokens() {
598         return this.ignoreEmptyTokens;
599     }
600 
601     /**
602      * Checks if the characters at the index specified match the quote
603      * already matched in readNextToken().
604      *
605      * @param srcChars  the character array being tokenized
606      * @param pos  the position to check for a quote
607      * @param len  the length of the character array being tokenized
608      * @param quoteStart  the start position of the matched quote, 0 if no quoting
609      * @param quoteLen  the length of the matched quote, 0 if no quoting
610      * @return true if a quote is matched
611      */
612     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
613         for (int i = 0; i < quoteLen; i++) {
614             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
615                 return false;
616             }
617         }
618         return true;
619     }
620 
621     /**
622      * Gets the next token and advances the iteration position.
623      *
624      * @return the next String token
625      * @throws NoSuchElementException if there are no more elements
626      */
627     @Override
628     public String next() {
629         if (!hasNext()) {
630             throw new NoSuchElementException();
631         }
632         return tokens[tokenPos++];
633     }
634 
635     /**
636      * Gets the index of the token that the next call to next() would return.
637      *
638      * @return the next token index
639      */
640     @Override
641     public int nextIndex() {
642         return this.tokenPos;
643     }
644 
645     /**
646      * Gets the next token from the String and advances the iteration position.
647      * Behaves like {@link #next()}, but signals exhaustion by returning null
648      * instead of throwing {@link NoSuchElementException}.
649      *
650      * @return the next sequential token, or null when no more tokens are found
651      */
652     public String nextToken() {
653         if (!hasNext()) {
654             return null;
655         }
656         return tokens[tokenPos++];
657     }
658 
659     /**
660      * Gets the token previous to the last returned token, moving back one position.
661      * @return the previous token
662      * @throws NoSuchElementException if there is no previous element
663      */
664     @Override
665     public String previous() {
666         if (!hasPrevious()) {
667             throw new NoSuchElementException();
668         }
669         return tokens[--tokenPos];
670     }
671 
672     /**
673      * Gets the index of the token that the next call to previous() would return.
674      *
675      * @return the previous token index
676      */
677     @Override
678     public int previousIndex() {
679         return this.tokenPos - 1;
680     }
681 
682     /**
683      * Gets the previous token from the String, moving back one position.
684      *
685      * @return the previous sequential token, or null when no more tokens are found
686      */
687     public String previousToken() {
688         if (!hasPrevious()) {
689             return null;
690         }
691         return tokens[--tokenPos];
692     }
693 
694     /**
695      * Reads the next token starting at the given position and adds it to the token list.
696      *
697      * @param srcChars  the character array being tokenized
698      * @param start  the first character of field
699      * @param len  the length of the character array being tokenized
700      * @param workArea  a temporary work area
701      * @param tokenList  the list of parsed tokens
702      * @return the starting position of the next field (the character
703      *  immediately after the delimiter), or -1 if end of string found
704      */
705     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
706         // skip leading ignored and trimmed characters, unless they are also the
707         // field delimiter or the quote character
708         while (start < len) {
709             final int removeLen = Math.max(
710                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
711                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
712             if (removeLen == 0 ||
713                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
714                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
715                 break;
716             }
717             start += removeLen;
718         }
719 
720         // handle reaching end: nothing is left after skipping, so the token is empty
721         if (start >= len) {
722             addToken(tokenList, StringUtils.EMPTY);
723             return -1;
724         }
725 
726         // handle empty token (a delimiter is found straight away)
727         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
728         if (delimLen > 0) {
729             addToken(tokenList, StringUtils.EMPTY);
730             return start + delimLen;
731         }
732 
733         // handle found token, read in quoting mode when it begins with a quote
734         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
735         if (quoteLen > 0) {
736             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
737         }
738         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
739     }
740 
741     /**
742      * Reads a possibly quoted string token and adds it to the token list.
743      *
744      * @param srcChars  the character array being tokenized
745      * @param start  the first character of field
746      * @param len  the length of the character array being tokenized
747      * @param workArea  a temporary work area
748      * @param tokenList  the list of parsed tokens
749      * @param quoteStart  the start position of the matched quote, 0 if no quoting
750      * @param quoteLen  the length of the matched quote, 0 if no quoting
751      * @return the starting position of the next field (the character
752      *  immediately after the delimiter), or -1 if the end of the string
753      *  was reached
754      */
755     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
756                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
757         // Loop until we've found the end of the quoted
758         // string or the end of the input
759         workArea.clear();
760         int pos = start;
761         boolean quoting = quoteLen > 0;
762         int trimStart = 0; // workArea size after the last character that must be kept
763 
764         while (pos < len) {
765             // quoting mode can occur several times throughout a string
766             // we must switch between quoting and non-quoting until we
767             // encounter a non-quoted delimiter, or end of string
768             if (quoting) {
769                 // In quoting mode
770 
771                 // If we've found a quote character, see if it's
772                 // followed by a second quote.  If so, then we need
773                 // to actually put the quote character into the token
774                 // rather than end the token.
775                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
776                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
777                         // matched pair of quotes, thus an escaped quote
778                         workArea.append(srcChars, pos, quoteLen);
779                         pos += quoteLen * 2;
780                         trimStart = workArea.size();
781                         continue;
782                     }
783 
784                     // end of quoting
785                     quoting = false;
786                     pos += quoteLen;
787                     continue;
788                 }
789 
790             } else {
791                 // Not in quoting mode
792 
793                 // check for delimiter, and thus end of token
794                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
795                 if (delimLen > 0) {
796                     // return condition when end of token found
797                     addToken(tokenList, workArea.substring(0, trimStart));
798                     return pos + delimLen;
799                 }
800 
801                 // check for quote, and thus back into quoting mode
802                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
803                     quoting = true;
804                     pos += quoteLen;
805                     continue;
806                 }
807 
808                 // check for ignored (outside quotes), and ignore
809                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
810                 if (ignoredLen > 0) {
811                     pos += ignoredLen;
812                     continue;
813                 }
814 
815                 // check for trimmed character
816                 // don't yet know if it's at the end, so copy to workArea
817                 // use trimStart to keep track of trim at the end
818                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
819                 if (trimmedLen > 0) {
820                     workArea.append(srcChars, pos, trimmedLen);
821                     pos += trimmedLen;
822                     continue;
823                 }
824             }
825             // copy regular character (quoted or not) and mark it as one to keep
826             workArea.append(srcChars[pos++]);
827             trimStart = workArea.size();
828         }
829 
830         // return condition when end of string found
831         addToken(tokenList, workArea.substring(0, trimStart));
832         return -1;
833     }
834 
835     /**
836      * Unsupported ListIterator operation; this method never removes a token.
837      *
838      * @throws UnsupportedOperationException always
839      */
840     @Override
841     public void remove() {
842         throw new UnsupportedOperationException("remove() is unsupported");
843     }
844 
845     /**
846      * Resets this tokenizer, forgetting all parsing and iteration already completed.
847      * <p>
848      * This method allows the same tokenizer to be reused for the same String.
849      * </p>
850      *
851      * @return this, to enable chaining
852      */
853     public StrTokenizer reset() {
854         tokenPos = 0;
855         tokens = null;
856         return this;
857     }
858 
859     /**
860      * Reset this tokenizer, giving it a new input string to parse.
861      * In this manner you can re-use a tokenizer with the same settings
862      * on multiple input lines.
863      *
864      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
865      * @return this, to enable chaining
866      */
867     public StrTokenizer reset(final char[] input) {
868         reset();
869         this.chars = ArrayUtils.clone(input);
870         return this;
871     }
872 
873     /**
874      * Reset this tokenizer, giving it a new input string to parse.
875      * In this manner you can re-use a tokenizer with the same settings
876      * on multiple input lines.
877      *
878      * @param input  the new string to tokenize, null sets no text to parse
879      * @return this, to enable chaining
880      */
881     public StrTokenizer reset(final String input) {
882         reset();
883         if (input != null) {
884             this.chars = input.toCharArray();
885         } else {
886             this.chars = null;
887         }
888         return this;
889     }
890 
891     /**
892      * Unsupported ListIterator operation.
893      * @param obj this parameter ignored.
894      * @throws UnsupportedOperationException always
895      */
896     @Override
897     public void set(final String obj) {
898         throw new UnsupportedOperationException("set() is unsupported");
899     }
900 
901     /**
902      * Sets the field delimiter character.
903      *
904      * @param delim  the delimiter character to use
905      * @return this, to enable chaining
906      */
907     public StrTokenizer setDelimiterChar(final char delim) {
908         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
909     }
910 
911     /**
912      * Sets the field delimiter matcher.
913      * <p>
914      * The delimiter is used to separate one token from another.
915      * </p>
916      *
917      * @param delim  the delimiter matcher to use
918      * @return this, to enable chaining
919      */
920     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
921         if (delim == null) {
922             this.delimMatcher = StrMatcher.noneMatcher();
923         } else {
924             this.delimMatcher = delim;
925         }
926         return this;
927     }
928 
929     /**
930      * Sets the field delimiter string.
931      *
932      * @param delim  the delimiter string to use
933      * @return this, to enable chaining
934      */
935     public StrTokenizer setDelimiterString(final String delim) {
936         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
937     }
938 
939     /**
940      * Sets whether the tokenizer should return empty tokens as null.
941      * The default for this property is false.
942      *
943      * @param emptyAsNull  whether empty tokens are returned as null
944      * @return this, to enable chaining
945      */
946     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
947         this.emptyAsNull = emptyAsNull;
948         return this;
949     }
950 
951     /**
952      * Sets the character to ignore.
953      * <p>
954      * This character is ignored when parsing the String, unless it is
955      * within a quoted region.
956      *
957      * @param ignored  the ignored character to use
958      * @return this, to enable chaining
959      */
960     public StrTokenizer setIgnoredChar(final char ignored) {
961         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
962     }
963 
964     /**
965      * Sets the matcher for characters to ignore.
966      * <p>
967      * These characters are ignored when parsing the String, unless they are
968      * within a quoted region.
969      * </p>
970      *
971      * @param ignored  the ignored matcher to use, null ignored
972      * @return this, to enable chaining
973      */
974     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
975         if (ignored != null) {
976             this.ignoredMatcher = ignored;
977         }
978         return this;
979     }
980 
981     /**
982      * Sets whether the tokenizer should ignore and not return empty tokens.
983      * The default for this property is true.
984      *
985      * @param ignoreEmptyTokens  whether empty tokens are not returned
986      * @return this, to enable chaining
987      */
988     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
989         this.ignoreEmptyTokens = ignoreEmptyTokens;
990         return this;
991     }
992 
993     /**
994      * Sets the quote character to use.
995      * <p>
996      * The quote character is used to wrap data between the tokens.
997      * This enables delimiters to be entered as data.
998      * </p>
999      *
1000      * @param quote  the quote character to use
1001      * @return this, to enable chaining
1002      */
1003     public StrTokenizer setQuoteChar(final char quote) {
1004         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1005     }
1006 
1007     /**
1008      * Sets the quote matcher to use.
1009      * <p>
1010      * The quote character is used to wrap data between the tokens.
1011      * This enables delimiters to be entered as data.
1012      * </p>
1013      *
1014      * @param quote  the quote matcher to use, null ignored
1015      * @return this, to enable chaining
1016      */
1017     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1018         if (quote != null) {
1019             this.quoteMatcher = quote;
1020         }
1021         return this;
1022     }
1023 
1024     /**
1025      * Sets the matcher for characters to trim.
1026      * <p>
1027      * These characters are trimmed off on each side of the delimiter
1028      * until the token or quote is found.
1029      * </p>
1030      *
1031      * @param trimmer  the trimmer matcher to use, null ignored
1032      * @return this, to enable chaining
1033      */
1034     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1035         if (trimmer != null) {
1036             this.trimmerMatcher = trimmer;
1037         }
1038         return this;
1039     }
1040 
1041     // API
1042     /**
1043      * Gets the number of tokens found in the String.
1044      *
1045      * @return the number of matched tokens
1046      */
1047     public int size() {
1048         checkTokenized();
1049         return tokens.length;
1050     }
1051 
1052     /**
1053      * Internal method to performs the tokenization.
1054      * <p>
1055      * Most users of this class do not need to call this method. This method
1056      * will be called automatically by other (public) methods when required.
1057      * </p>
1058      * <p>
1059      * This method exists to allow subclasses to add code before or after the
1060      * tokenization. For example, a subclass could alter the character array,
1061      * offset or count to be parsed, or call the tokenizer multiple times on
1062      * multiple strings. It is also be possible to filter the results.
1063      * </p>
1064      * <p>
1065      * {@link StrTokenizer} will always pass a zero offset and a count
1066      * equal to the length of the array to this method, however a subclass
1067      * may pass other values, or even an entirely different array.
1068      * </p>
1069      *
1070      * @param srcChars  the character array being tokenized, may be null
1071      * @param offset  the start position within the character array, must be valid
1072      * @param count  the number of characters to tokenize, must be valid
1073      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
1074      */
1075     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1076         if (ArrayUtils.isEmpty(srcChars)) {
1077             return Collections.emptyList();
1078         }
1079         final StrBuilder buf = new StrBuilder();
1080         final List<String> tokenList = new ArrayList<>();
1081         int pos = offset;
1082 
1083         // loop around the entire buffer
1084         while (pos >= 0 && pos < count) {
1085             // find next token
1086             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1087 
1088             // handle case where end of string is a delimiter
1089             if (pos >= count) {
1090                 addToken(tokenList, StringUtils.EMPTY);
1091             }
1092         }
1093         return tokenList;
1094     }
1095 
1096     /**
1097      * Gets the String content that the tokenizer is parsing.
1098      *
1099      * @return the string content being parsed
1100      */
1101     @Override
1102     public String toString() {
1103         if (tokens == null) {
1104             return "StrTokenizer[not tokenized yet]";
1105         }
1106         return "StrTokenizer" + getTokenList();
1107     }
1108 
1109 }