View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.lang3.ArrayUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators)
30   * and supporting quoting and ignored character concepts.
31   * <p>
32   * This class can split a String into many smaller strings. It aims
33   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34   * however it offers much more control and flexibility including implementing
35   * the {@code ListIterator} interface. By default, it is set up
36   * like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>.
39   * Each token is separated from the next String by a <em>delimiter</em>.
40   * One or more delimiter characters must be specified.
41   * <p>
42   * Each token may be surrounded by quotes.
43   * The <em>quote</em> matcher specifies the quote character(s).
44   * A quote may be escaped within a quoted section by duplicating itself.
45   * <p>
46   * Between each token and the delimiter are potentially characters that need trimming.
47   * The <em>trimmer</em> matcher specifies these characters.
48   * One usage might be to trim whitespace characters.
49   * <p>
50   * At any point outside the quotes there might potentially be invalid characters.
51   * The <em>ignored</em> matcher specifies these characters to be removed.
52   * One usage might be to remove new line characters.
53   * <p>
54   * Empty tokens may be removed or returned as null.
55   * <pre>
56   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
57   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
58   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59   * </pre>
60   *
61   * <table>
62   *  <caption>StrTokenizer properties and options</caption>
63   *  <tr>
64   *   <th>Property</th><th>Type</th><th>Default</th>
65   *  </tr>
66   *  <tr>
67   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68   *  </tr>
69   *  <tr>
70   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
71   *  </tr>
72   *  <tr>
73   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74   *  </tr>
75   *  <tr>
76   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77   *  </tr>
78   *  <tr>
79   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80   *  </tr>
81   * </table>
82   *
83   * @since 1.0
84   * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85   */
86  @Deprecated
87  public class StrTokenizer implements ListIterator<String>, Cloneable {
88  
    /**
     * Prototype tokenizer for Comma Separated Values. It is private and only ever
     * cloned by {@code getCSVClone()}, so the shared instance itself is not exposed.
     * <p>
     * Configured with a comma delimiter, double-quote quoting, nothing ignored,
     * the trim matcher for trimming, and empty tokens kept as empty strings
     * (neither dropped nor converted to {@code null}).
     * </p>
     */
    // @formatter:off
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
99  
    /**
     * Prototype tokenizer for Tab Separated Values. It is private and only ever
     * cloned by {@code getTSVClone()}, so the shared instance itself is not exposed.
     * <p>
     * Configured with a tab delimiter, double-quote quoting, nothing ignored,
     * the trim matcher for trimming, and empty tokens kept as empty strings
     * (neither dropped nor converted to {@code null}).
     * </p>
     */
    // @formatter:off
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
110 
111     /**
112      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113      *
114      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115      */
116     private static StrTokenizer getCSVClone() {
117         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118     }
119 
120     /**
121      * Gets a new tokenizer instance which parses Comma Separated Value strings
122      * initializing it with the given input.  The default for CSV processing
123      * will be trim whitespace from both ends (which can be overridden with
124      * the setTrimmer method).
125      * <p>
126      * You must call a "reset" method to set the string which you want to parse.
127      * </p>
128      * @return a new tokenizer instance which parses Comma Separated Value strings
129      */
130     public static StrTokenizer getCSVInstance() {
131         return getCSVClone();
132     }
133 
134     /**
135      * Gets a new tokenizer instance which parses Comma Separated Value strings
136      * initializing it with the given input.  The default for CSV processing
137      * will be trim whitespace from both ends (which can be overridden with
138      * the setTrimmer method).
139      *
140      * @param input  the text to parse
141      * @return a new tokenizer instance which parses Comma Separated Value strings
142      */
143     public static StrTokenizer getCSVInstance(final char[] input) {
144         final StrTokenizer tok = getCSVClone();
145         tok.reset(input);
146         return tok;
147     }
148 
149     /**
150      * Gets a new tokenizer instance which parses Comma Separated Value strings
151      * initializing it with the given input.  The default for CSV processing
152      * will be trim whitespace from both ends (which can be overridden with
153      * the setTrimmer method).
154      *
155      * @param input  the text to parse
156      * @return a new tokenizer instance which parses Comma Separated Value strings
157      */
158     public static StrTokenizer getCSVInstance(final String input) {
159         final StrTokenizer tok = getCSVClone();
160         tok.reset(input);
161         return tok;
162     }
163     /**
164      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
165      *
166      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167      */
168     private static StrTokenizer getTSVClone() {
169         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
170     }
171 
172     /**
173      * Gets a new tokenizer instance which parses Tab Separated Value strings.
174      * The default for CSV processing will be trim whitespace from both ends
175      * (which can be overridden with the setTrimmer method).
176      * <p>
177      * You must call a "reset" method to set the string which you want to parse.
178      * </p>
179      * @return a new tokenizer instance which parses Tab Separated Value strings.
180      */
181     public static StrTokenizer getTSVInstance() {
182         return getTSVClone();
183     }
184 
185     /**
186      * Gets a new tokenizer instance which parses Tab Separated Value strings.
187      * The default for CSV processing will be trim whitespace from both ends
188      * (which can be overridden with the setTrimmer method).
189      * @param input  the string to parse
190      * @return a new tokenizer instance which parses Tab Separated Value strings.
191      */
192     public static StrTokenizer getTSVInstance(final char[] input) {
193         final StrTokenizer tok = getTSVClone();
194         tok.reset(input);
195         return tok;
196     }
197 
198     /**
199      * Gets a new tokenizer instance which parses Tab Separated Value strings.
200      * The default for CSV processing will be trim whitespace from both ends
201      * (which can be overridden with the setTrimmer method).
202      * @param input  the string to parse
203      * @return a new tokenizer instance which parses Tab Separated Value strings.
204      */
205     public static StrTokenizer getTSVInstance(final String input) {
206         final StrTokenizer tok = getTSVClone();
207         tok.reset(input);
208         return tok;
209     }
210 
    /** The text to work on; {@code null} when no input has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run (and again after a reset). */
    private String[] tokens;

    /** The current iteration position: the index into {@code tokens} of the next token to return. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
237 
238     /**
239      * Constructs a tokenizer splitting on space, tab, newline and form feed
240      * as per StringTokenizer, but with no text to tokenize.
241      * <p>
242      * This constructor is normally used with {@link #reset(String)}.
243      * </p>
244      */
245     public StrTokenizer() {
246         this.chars = null;
247     }
248 
249     /**
250      * Constructs a tokenizer splitting on space, tab, newline and form feed
251      * as per StringTokenizer.
252      *
253      * @param input  the string which is to be parsed, not cloned
254      */
255     public StrTokenizer(final char[] input) {
256         if (input == null) {
257             this.chars = null;
258         } else {
259             this.chars = input.clone();
260         }
261     }
262 
263     /**
264      * Constructs a tokenizer splitting on the specified character.
265      *
266      * @param input  the string which is to be parsed, not cloned
267      * @param delim the field delimiter character
268      */
269     public StrTokenizer(final char[] input, final char delim) {
270         this(input);
271         setDelimiterChar(delim);
272     }
273 
274     /**
275      * Constructs a tokenizer splitting on the specified delimiter character
276      * and handling quotes using the specified quote character.
277      *
278      * @param input  the string which is to be parsed, not cloned
279      * @param delim  the field delimiter character
280      * @param quote  the field quoted string character
281      */
282     public StrTokenizer(final char[] input, final char delim, final char quote) {
283         this(input, delim);
284         setQuoteChar(quote);
285     }
286 
287     /**
288      * Constructs a tokenizer splitting on the specified string.
289      *
290      * @param input  the string which is to be parsed, not cloned
291      * @param delim the field delimiter string
292      */
293     public StrTokenizer(final char[] input, final String delim) {
294         this(input);
295         setDelimiterString(delim);
296     }
297 
298     /**
299      * Constructs a tokenizer splitting using the specified delimiter matcher.
300      *
301      * @param input  the string which is to be parsed, not cloned
302      * @param delim  the field delimiter matcher
303      */
304     public StrTokenizer(final char[] input, final StrMatcher delim) {
305         this(input);
306         setDelimiterMatcher(delim);
307     }
308 
309     /**
310      * Constructs a tokenizer splitting using the specified delimiter matcher
311      * and handling quotes using the specified quote matcher.
312      *
313      * @param input  the string which is to be parsed, not cloned
314      * @param delim  the field delimiter character
315      * @param quote  the field quoted string character
316      */
317     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
318         this(input, delim);
319         setQuoteMatcher(quote);
320     }
321 
322     /**
323      * Constructs a tokenizer splitting on space, tab, newline and form feed
324      * as per StringTokenizer.
325      *
326      * @param input  the string which is to be parsed
327      */
328     public StrTokenizer(final String input) {
329         if (input != null) {
330             chars = input.toCharArray();
331         } else {
332             chars = null;
333         }
334     }
335 
336     /**
337      * Constructs a tokenizer splitting on the specified delimiter character.
338      *
339      * @param input  the string which is to be parsed
340      * @param delim  the field delimiter character
341      */
342     public StrTokenizer(final String input, final char delim) {
343         this(input);
344         setDelimiterChar(delim);
345     }
346 
347     /**
348      * Constructs a tokenizer splitting on the specified delimiter character
349      * and handling quotes using the specified quote character.
350      *
351      * @param input  the string which is to be parsed
352      * @param delim  the field delimiter character
353      * @param quote  the field quoted string character
354      */
355     public StrTokenizer(final String input, final char delim, final char quote) {
356         this(input, delim);
357         setQuoteChar(quote);
358     }
359 
360     /**
361      * Constructs a tokenizer splitting on the specified delimiter string.
362      *
363      * @param input  the string which is to be parsed
364      * @param delim  the field delimiter string
365      */
366     public StrTokenizer(final String input, final String delim) {
367         this(input);
368         setDelimiterString(delim);
369     }
370 
371     /**
372      * Constructs a tokenizer splitting using the specified delimiter matcher.
373      *
374      * @param input  the string which is to be parsed
375      * @param delim  the field delimiter matcher
376      */
377     public StrTokenizer(final String input, final StrMatcher delim) {
378         this(input);
379         setDelimiterMatcher(delim);
380     }
381 
382     /**
383      * Constructs a tokenizer splitting using the specified delimiter matcher
384      * and handling quotes using the specified quote matcher.
385      *
386      * @param input  the string which is to be parsed
387      * @param delim  the field delimiter matcher
388      * @param quote  the field quoted string matcher
389      */
390     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
391         this(input, delim);
392         setQuoteMatcher(quote);
393     }
394 
395     /**
396      * Unsupported ListIterator operation.
397      * @param obj this parameter ignored.
398      * @throws UnsupportedOperationException always
399      */
400     @Override
401     public void add(final String obj) {
402         throw new UnsupportedOperationException("add() is unsupported");
403     }
404 
405     /**
406      * Adds a token to a list, paying attention to the parameters we've set.
407      *
408      * @param list  the list to add to
409      * @param tok  the token to add
410      */
411     private void addToken(final List<String> list, String tok) {
412         if (tok == null || tok.isEmpty()) {
413             if (isIgnoreEmptyTokens()) {
414                 return;
415             }
416             if (isEmptyTokenAsNull()) {
417                 tok = null;
418             }
419         }
420         list.add(tok);
421     }
422 
423     /**
424      * Checks if tokenization has been done, and if not then do it.
425      */
426     private void checkTokenized() {
427         if (tokens == null) {
428             if (chars == null) {
429                 // still call tokenize as subclass may do some work
430                 final List<String> split = tokenize(null, 0, 0);
431                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
432             } else {
433                 final List<String> split = tokenize(chars, 0, chars.length);
434                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
435             }
436         }
437     }
438 
439     /**
440      * Creates a new instance of this Tokenizer. The new instance is reset so
441      * that it will be at the start of the token list.
442      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
443      *
444      * @return a new instance of this Tokenizer which has been reset.
445      */
446     @Override
447     public Object clone() {
448         try {
449             return cloneReset();
450         } catch (final CloneNotSupportedException ex) {
451             return null;
452         }
453     }
454 
455     /**
456      * Creates a new instance of this Tokenizer. The new instance is reset so that
457      * it will be at the start of the token list.
458      *
459      * @return a new instance of this Tokenizer which has been reset.
460      * @throws CloneNotSupportedException if there is a problem cloning
461      */
462     Object cloneReset() throws CloneNotSupportedException {
463         // this method exists to enable 100% test coverage
464         final StrTokenizer cloned = (StrTokenizer) super.clone();
465         if (cloned.chars != null) {
466             cloned.chars = cloned.chars.clone();
467         }
468         cloned.reset();
469         return cloned;
470     }
471 
472     /**
473      * Gets the String content that the tokenizer is parsing.
474      *
475      * @return The string content being parsed
476      */
477     public String getContent() {
478         if (chars == null) {
479             return null;
480         }
481         return new String(chars);
482     }
483 
484     /**
485      * Gets the field delimiter matcher.
486      *
487      * @return The delimiter matcher in use
488      */
489     public StrMatcher getDelimiterMatcher() {
490         return this.delimMatcher;
491     }
492 
493     /**
494      * Gets the ignored character matcher.
495      * <p>
496      * These characters are ignored when parsing the String, unless they are
497      * within a quoted region.
498      * The default value is not to ignore anything.
499      * </p>
500      *
501      * @return The ignored matcher in use
502      */
503     public StrMatcher getIgnoredMatcher() {
504         return ignoredMatcher;
505     }
506 
507     /**
508      * Gets the quote matcher currently in use.
509      * <p>
510      * The quote character is used to wrap data between the tokens.
511      * This enables delimiters to be entered as data.
512      * The default value is '"' (double quote).
513      * </p>
514      *
515      * @return The quote matcher in use
516      */
517     public StrMatcher getQuoteMatcher() {
518         return quoteMatcher;
519     }
520 
521     /**
522      * Gets a copy of the full token list as an independent modifiable array.
523      *
524      * @return The tokens as a String array
525      */
526     public String[] getTokenArray() {
527         checkTokenized();
528         return tokens.clone();
529     }
530 
531     /**
532      * Gets a copy of the full token list as an independent modifiable list.
533      *
534      * @return The tokens as a String array
535      */
536     public List<String> getTokenList() {
537         checkTokenized();
538         final List<String> list = new ArrayList<>(tokens.length);
539         Collections.addAll(list, tokens);
540 
541         return list;
542     }
543 
544     /**
545      * Gets the trimmer character matcher.
546      * <p>
547      * These characters are trimmed off on each side of the delimiter
548      * until the token or quote is found.
549      * The default value is not to trim anything.
550      * </p>
551      *
552      * @return The trimmer matcher in use
553      */
554     public StrMatcher getTrimmerMatcher() {
555         return trimmerMatcher;
556     }
557 
558     /**
559      * Checks whether there are any more tokens.
560      *
561      * @return true if there are more tokens
562      */
563     @Override
564     public boolean hasNext() {
565         checkTokenized();
566         return tokenPos < tokens.length;
567     }
568 
569     /**
570      * Checks whether there are any previous tokens that can be iterated to.
571      *
572      * @return true if there are previous tokens
573      */
574     @Override
575     public boolean hasPrevious() {
576         checkTokenized();
577         return tokenPos > 0;
578     }
579 
580     /**
581      * Gets whether the tokenizer currently returns empty tokens as null.
582      * The default for this property is false.
583      *
584      * @return true if empty tokens are returned as null
585      */
586     public boolean isEmptyTokenAsNull() {
587         return this.emptyAsNull;
588     }
589 
590     /**
591      * Gets whether the tokenizer currently ignores empty tokens.
592      * The default for this property is true.
593      *
594      * @return true if empty tokens are not returned
595      */
596     public boolean isIgnoreEmptyTokens() {
597         return ignoreEmptyTokens;
598     }
599 
600     /**
601      * Checks if the characters at the index specified match the quote
602      * already matched in readNextToken().
603      *
604      * @param srcChars  the character array being tokenized
605      * @param pos  the position to check for a quote
606      * @param len  the length of the character array being tokenized
607      * @param quoteStart  the start position of the matched quote, 0 if no quoting
608      * @param quoteLen  the length of the matched quote, 0 if no quoting
609      * @return true if a quote is matched
610      */
611     private boolean isQuote(final char[] srcChars,
612                             final int pos,
613                             final int len,
614                             final int quoteStart,
615                             final int quoteLen) {
616         for (int i = 0; i < quoteLen; i++) {
617             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
618                 return false;
619             }
620         }
621         return true;
622     }
623 
624     /**
625      * Gets the next token.
626      *
627      * @return The next String token
628      * @throws NoSuchElementException if there are no more elements
629      */
630     @Override
631     public String next() {
632         if (hasNext()) {
633             return tokens[tokenPos++];
634         }
635         throw new NoSuchElementException();
636     }
637 
638     /**
639      * Gets the index of the next token to return.
640      *
641      * @return The next token index
642      */
643     @Override
644     public int nextIndex() {
645         return tokenPos;
646     }
647 
648     /**
649      * Gets the next token from the String.
650      * Equivalent to {@link #next()} except it returns null rather than
651      * throwing {@link NoSuchElementException} when no tokens remain.
652      *
653      * @return The next sequential token, or null when no more tokens are found
654      */
655     public String nextToken() {
656         if (hasNext()) {
657             return tokens[tokenPos++];
658         }
659         return null;
660     }
661 
662     /**
663      * Gets the token previous to the last returned token.
664      *
665      * @return The previous token
666      */
667     @Override
668     public String previous() {
669         if (hasPrevious()) {
670             return tokens[--tokenPos];
671         }
672         throw new NoSuchElementException();
673     }
674 
675     /**
676      * Gets the index of the previous token.
677      *
678      * @return The previous token index
679      */
680     @Override
681     public int previousIndex() {
682         return tokenPos - 1;
683     }
684 
685     /**
686      * Gets the previous token from the String.
687      *
688      * @return The previous sequential token, or null when no more tokens are found
689      */
690     public String previousToken() {
691         if (hasPrevious()) {
692             return tokens[--tokenPos];
693         }
694         return null;
695     }
696 
    /**
     * Reads the next token starting at {@code start} and adds it to
     * {@code tokenList} (subject to the empty-token rules in {@code addToken}).
     * <p>
     * Leading ignored/trimmed characters are skipped first. Then one of three
     * things happens: the end of input is reached (an empty token is added),
     * a delimiter is found immediately (an empty token is added), or the token
     * body is read by {@code readWithQuotes}, in quoted mode if the token
     * starts with a quote.
     * </p>
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip leading characters matched by the ignored or trimmer matcher,
        // unless the position is also a field delimiter or a quote
        while (start < len) {
            // the longer of the two matches is the amount that may be skipped
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end: nothing but skippable characters remained
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token: a delimiter directly at the field start
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            // token opens with a quote: start reading after it and remember where
            // the quote sits so the same sequence can be recognized again
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        // unquoted token: quoteStart and quoteLen of 0 mean "no quoting"
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
747 
    /**
     * Reads a possibly quoted string token, accumulating its text in
     * {@code workArea} and adding the finished token to {@code tokenList}.
     * <p>
     * Reading stops at the first delimiter found outside quotes, or at the end
     * of input. Trailing characters matched by the trimmer matcher outside
     * quotes are dropped from the token.
     * </p>
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea up to the last character that must be kept;
        // anything appended after it is trailing trim and is cut off at the end
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                // (trimStart is deliberately NOT advanced here)
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

            }
            // copy a regular character: any non-quote character inside quotes,
            // or a character outside quotes that none of the matchers claimed
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
842 
843     /**
844      * Unsupported ListIterator operation.
845      *
846      * @throws UnsupportedOperationException always
847      */
848     @Override
849     public void remove() {
850         throw new UnsupportedOperationException("remove() is unsupported");
851     }
852 
853     /**
854      * Resets this tokenizer, forgetting all parsing and iteration already completed.
855      * <p>
856      * This method allows the same tokenizer to be reused for the same String.
857      *
858      * @return this, to enable chaining
859      */
860     public StrTokenizer reset() {
861         tokenPos = 0;
862         tokens = null;
863         return this;
864     }
865 
866     /**
867      * Reset this tokenizer, giving it a new input string to parse.
868      * In this manner you can re-use a tokenizer with the same settings
869      * on multiple input lines.
870      *
871      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
872      * @return this, to enable chaining
873      */
874     public StrTokenizer reset(final char[] input) {
875         reset();
876         if (input != null) {
877             this.chars = input.clone();
878         } else {
879             this.chars = null;
880         }
881         return this;
882     }
883 
884     /**
885      * Reset this tokenizer, giving it a new input string to parse.
886      * In this manner you can re-use a tokenizer with the same settings
887      * on multiple input lines.
888      *
889      * @param input  the new string to tokenize, null sets no text to parse
890      * @return this, to enable chaining
891      */
892     public StrTokenizer reset(final String input) {
893         reset();
894         if (input != null) {
895             this.chars = input.toCharArray();
896         } else {
897             this.chars = null;
898         }
899         return this;
900     }
901 
902     /**
903      * Unsupported ListIterator operation.
904      * @param obj this parameter ignored.
905      * @throws UnsupportedOperationException always
906      */
907     @Override
908     public void set(final String obj) {
909         throw new UnsupportedOperationException("set() is unsupported");
910     }
911 
912     /**
913      * Sets the field delimiter character.
914      *
915      * @param delim  the delimiter character to use
916      * @return this, to enable chaining
917      */
918     public StrTokenizer setDelimiterChar(final char delim) {
919         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
920     }
921 
922     /**
923      * Sets the field delimiter matcher.
924      * <p>
925      * The delimiter is used to separate one token from another.
926      * </p>
927      *
928      * @param delim  the delimiter matcher to use
929      * @return this, to enable chaining
930      */
931     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
932         if (delim == null) {
933             this.delimMatcher = StrMatcher.noneMatcher();
934         } else {
935             this.delimMatcher = delim;
936         }
937         return this;
938     }
939 
940     /**
941      * Sets the field delimiter string.
942      *
943      * @param delim  the delimiter string to use
944      * @return this, to enable chaining
945      */
946     public StrTokenizer setDelimiterString(final String delim) {
947         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
948     }
949 
950     /**
951      * Sets whether the tokenizer should return empty tokens as null.
952      * The default for this property is false.
953      *
954      * @param emptyAsNull  whether empty tokens are returned as null
955      * @return this, to enable chaining
956      */
957     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
958         this.emptyAsNull = emptyAsNull;
959         return this;
960     }
961 
962     /**
963      * Sets the character to ignore.
964      * <p>
965      * This character is ignored when parsing the String, unless it is
966      * within a quoted region.
967      * </p>
968      *
969      * @param ignored  the ignored character to use
970      * @return this, to enable chaining
971      */
972     public StrTokenizer setIgnoredChar(final char ignored) {
973         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
974     }
975 
976     /**
977      * Sets the matcher for characters to ignore.
978      * <p>
979      * These characters are ignored when parsing the String, unless they are
980      * within a quoted region.
981      * </p>
982      *
983      * @param ignored  the ignored matcher to use, null ignored
984      * @return this, to enable chaining
985      */
986     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
987         if (ignored != null) {
988             this.ignoredMatcher = ignored;
989         }
990         return this;
991     }
992 
993     /**
994      * Sets whether the tokenizer should ignore and not return empty tokens.
995      * The default for this property is true.
996      *
997      * @param ignoreEmptyTokens  whether empty tokens are not returned
998      * @return this, to enable chaining
999      */
1000     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1001         this.ignoreEmptyTokens = ignoreEmptyTokens;
1002         return this;
1003     }
1004 
1005     /**
1006      * Sets the quote character to use.
1007      * <p>
1008      * The quote character is used to wrap data between the tokens.
1009      * This enables delimiters to be entered as data.
1010      * </p>
1011      *
1012      * @param quote  the quote character to use
1013      * @return this, to enable chaining
1014      */
1015     public StrTokenizer setQuoteChar(final char quote) {
1016         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1017     }
1018 
1019     /**
1020      * Sets the quote matcher to use.
1021      * <p>
1022      * The quote character is used to wrap data between the tokens.
1023      * This enables delimiters to be entered as data.
1024      * </p>
1025      *
1026      * @param quote  the quote matcher to use, null ignored
1027      * @return this, to enable chaining
1028      */
1029     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1030         if (quote != null) {
1031             this.quoteMatcher = quote;
1032         }
1033         return this;
1034     }
1035 
1036     /**
1037      * Sets the matcher for characters to trim.
1038      * <p>
1039      * These characters are trimmed off on each side of the delimiter
1040      * until the token or quote is found.
1041      * </p>
1042      *
1043      * @param trimmer  the trimmer matcher to use, null ignored
1044      * @return this, to enable chaining
1045      */
1046     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1047         if (trimmer != null) {
1048             this.trimmerMatcher = trimmer;
1049         }
1050         return this;
1051     }
1052 
1053     /**
1054      * Gets the number of tokens found in the String.
1055      *
1056      * @return The number of matched tokens
1057      */
1058     public int size() {
1059         checkTokenized();
1060         return tokens.length;
1061     }
1062 
1063     /**
1064      * Internal method to performs the tokenization.
1065      * <p>
1066      * Most users of this class do not need to call this method. This method
1067      * will be called automatically by other (public) methods when required.
1068      * </p>
1069      * <p>
1070      * This method exists to allow subclasses to add code before or after the
1071      * tokenization. For example, a subclass could alter the character array,
1072      * offset or count to be parsed, or call the tokenizer multiple times on
1073      * multiple strings. It is also be possible to filter the results.
1074      * </p>
1075      * <p>
1076      * {@code StrTokenizer} will always pass a zero offset and a count
1077      * equal to the length of the array to this method, however a subclass
1078      * may pass other values, or even an entirely different array.
1079      * </p>
1080      *
1081      * @param srcChars  the character array being tokenized, may be null
1082      * @param offset  the start position within the character array, must be valid
1083      * @param count  the number of characters to tokenize, must be valid
1084      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1085      */
1086     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1087         if (srcChars == null || count == 0) {
1088             return Collections.emptyList();
1089         }
1090         final StrBuilder buf = new StrBuilder();
1091         final List<String> tokenList = new ArrayList<>();
1092         int pos = offset;
1093 
1094         // loop around the entire buffer
1095         while (pos >= 0 && pos < count) {
1096             // find next token
1097             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1098 
1099             // handle case where end of string is a delimiter
1100             if (pos >= count) {
1101                 addToken(tokenList, StringUtils.EMPTY);
1102             }
1103         }
1104         return tokenList;
1105     }
1106 
1107     /**
1108      * Gets the String content that the tokenizer is parsing.
1109      *
1110      * @return The string content being parsed
1111      */
1112     @Override
1113     public String toString() {
1114         if (tokens == null) {
1115             return "StrTokenizer[not tokenized yet]";
1116         }
1117         return "StrTokenizer" + getTokenList();
1118     }
1119 
1120 }