View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.lang3.ArrayUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators)
30   * and supporting quoting and ignored character concepts.
31   * <p>
32   * This class can split a String into many smaller strings. It aims
33   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34   * however it offers much more control and flexibility including implementing
35   * the {@code ListIterator} interface. By default, it is set up
36   * like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>.
39   * Each token is separated from the next String by a <em>delimiter</em>.
40   * One or more delimiter characters must be specified.
41   * <p>
42   * Each token may be surrounded by quotes.
43   * The <em>quote</em> matcher specifies the quote character(s).
44   * A quote may be escaped within a quoted section by duplicating itself.
45   * <p>
46   * Between each token and the delimiter are potentially characters that need trimming.
47   * The <em>trimmer</em> matcher specifies these characters.
48   * One usage might be to trim whitespace characters.
49   * <p>
50   * At any point outside the quotes there might potentially be invalid characters.
51   * The <em>ignored</em> matcher specifies these characters to be removed.
52   * One usage might be to remove new line characters.
53   * <p>
54   * Empty tokens may be removed or returned as null.
55   * <pre>
56   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
57   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
58   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59   * </pre>
60   *
61   * <table>
62   *  <caption>StrTokenizer properties and options</caption>
63   *  <tr>
64   *   <th>Property</th><th>Type</th><th>Default</th>
65   *  </tr>
66   *  <tr>
67   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68   *  </tr>
69   *  <tr>
70   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
71   *  </tr>
72   *  <tr>
73   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74   *  </tr>
75   *  <tr>
76   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77   *  </tr>
78   *  <tr>
79   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80   *  </tr>
81   * </table>
82   *
83   * @since 1.0
84   * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85   */
86  @Deprecated
87  public class StrTokenizer implements ListIterator<String>, Cloneable {
88  
89      /** Comma separated values tokenizer internal variable. */
90      // @formatter:off
91      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
92              .setDelimiterMatcher(StrMatcher.commaMatcher())
93              .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
94              .setIgnoredMatcher(StrMatcher.noneMatcher())
95              .setTrimmerMatcher(StrMatcher.trimMatcher())
96              .setEmptyTokenAsNull(false)
97              .setIgnoreEmptyTokens(false);
98      // @formatter:on
99  
100     /** Tab separated values tokenizer internal variable. */
101     // @formatter:off
102     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
103             .setDelimiterMatcher(StrMatcher.tabMatcher())
104             .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
105             .setIgnoredMatcher(StrMatcher.noneMatcher())
106             .setTrimmerMatcher(StrMatcher.trimMatcher())
107             .setEmptyTokenAsNull(false)
108             .setIgnoreEmptyTokens(false);
109     // @formatter:on
110 
111     /**
112      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113      *
114      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115      */
116     private static StrTokenizer getCSVClone() {
117         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118     }
119 
120     /**
121      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
122      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
123      * <p>
124      * You must call a "reset" method to set the string which you want to parse.
125      * </p>
126      *
127      * @return a new tokenizer instance which parses Comma Separated Value strings.
128      */
129     public static StrTokenizer getCSVInstance() {
130         return getCSVClone();
131     }
132 
133     /**
134      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
135      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
136      *
137      * @param input the text to parse.
138      * @return a new tokenizer instance which parses Comma Separated Value strings.
139      */
140     public static StrTokenizer getCSVInstance(final char[] input) {
141         final StrTokenizer tok = getCSVClone();
142         tok.reset(input);
143         return tok;
144     }
145 
146     /**
147      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
148      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
149      *
150      * @param input the text to parse.
151      * @return a new tokenizer instance which parses Comma Separated Value strings.
152      */
153     public static StrTokenizer getCSVInstance(final String input) {
154         final StrTokenizer tok = getCSVClone();
155         tok.reset(input);
156         return tok;
157     }
158 
159     /**
160      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
161      *
162      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
163      */
164     private static StrTokenizer getTSVClone() {
165         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
166     }
167 
168     /**
169      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
170      * be overridden with the setTrimmer method).
171      * <p>
172      * You must call a "reset" method to set the string which you want to parse.
173      * </p>
174      *
175      * @return a new tokenizer instance which parses Tab Separated Value strings.
176      */
177     public static StrTokenizer getTSVInstance() {
178         return getTSVClone();
179     }
180 
181     /**
182      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
183      * be overridden with the setTrimmer method).
184      *
185      * @param input the string to parse.
186      * @return a new tokenizer instance which parses Tab Separated Value strings.
187      */
188     public static StrTokenizer getTSVInstance(final char[] input) {
189         final StrTokenizer tok = getTSVClone();
190         tok.reset(input);
191         return tok;
192     }
193 
194     /**
195      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
196      * be overridden with the setTrimmer method).
197      *
198      * @param input the string to parse.
199      * @return a new tokenizer instance which parses Tab Separated Value strings.
200      */
201     public static StrTokenizer getTSVInstance(final String input) {
202         final StrTokenizer tok = getTSVClone();
203         tok.reset(input);
204         return tok;
205     }
206 
207     /** The text to work on. */
208     private char[] chars;
209 
210     /** The parsed tokens. */
211     private String[] tokens;
212 
213     /** The current iteration position. */
214     private int tokenPos;
215 
216     /** The delimiter matcher. */
217     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
218 
219     /** The quote matcher. */
220     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
221 
222     /** The ignored matcher. */
223     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
224 
225     /** The trimmer matcher. */
226     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
227 
228     /** Whether to return empty tokens as null. */
229     private boolean emptyAsNull;
230 
231     /** Whether to ignore empty tokens. */
232     private boolean ignoreEmptyTokens = true;
233 
/**
 * Constructs a tokenizer with the default settings (splitting on space, tab, newline and form feed as per
 * StringTokenizer) and no text to tokenize.
 * <p>
 * Supply the text afterwards, typically through {@link #reset(String)}.
 * </p>
 */
public StrTokenizer() {
    // Explicitly start without any input.
    chars = null;
}
243 
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 *
 * @param input  the characters which are to be parsed; the array is cloned, so later changes to it
 *               do not affect this tokenizer. {@code null} means no text to parse.
 */
public StrTokenizer(final char[] input) {
    // Keep a private copy so the caller's array and this tokenizer stay independent.
    this.chars = input == null ? null : input.clone();
}
257 
/**
 * Constructs a tokenizer splitting on the specified character.
 *
 * @param input  the characters which are to be parsed; the array is cloned by the delegated constructor.
 * @param delim the field delimiter character, applied through {@code setDelimiterChar}.
 */
public StrTokenizer(final char[] input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
268 
/**
 * Constructs a tokenizer splitting on the specified delimiter character
 * and handling quotes using the specified quote character.
 *
 * @param input  the characters which are to be parsed; the array is cloned by the delegated constructor.
 * @param delim  the field delimiter character.
 * @param quote  the field quoted string character, applied through {@code setQuoteChar}.
 */
public StrTokenizer(final char[] input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
281 
/**
 * Constructs a tokenizer splitting on the specified string.
 *
 * @param input  the characters which are to be parsed; the array is cloned by the delegated constructor.
 * @param delim the field delimiter string, applied through {@code setDelimiterString}.
 */
public StrTokenizer(final char[] input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
292 
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input  the characters which are to be parsed; the array is cloned by the delegated constructor.
 * @param delim  the field delimiter matcher, applied through {@code setDelimiterMatcher}.
 */
public StrTokenizer(final char[] input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
303 
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher
 * and handling quotes using the specified quote matcher.
 *
 * @param input  the characters which are to be parsed; the array is cloned by the delegated constructor.
 * @param delim  the field delimiter matcher.
 * @param quote  the field quoted string matcher, applied through {@code setQuoteMatcher}.
 */
public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
316 
/**
 * Constructs a tokenizer for the given string, splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 *
 * @param input  the string which is to be parsed; {@code null} means no text to parse.
 */
public StrTokenizer(final String input) {
    this.chars = input == null ? null : input.toCharArray();
}
330 
/**
 * Constructs a tokenizer for the given string that splits on a single delimiter character.
 *
 * @param input  the string which is to be parsed.
 * @param delim  the field delimiter character, applied through {@code setDelimiterChar}.
 */
public StrTokenizer(final String input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
341 
/**
 * Constructs a tokenizer for the given string that splits on a single delimiter character
 * and recognises a single quote character.
 *
 * @param input  the string which is to be parsed.
 * @param delim  the field delimiter character.
 * @param quote  the field quoted string character, applied through {@code setQuoteChar}.
 */
public StrTokenizer(final String input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
354 
/**
 * Constructs a tokenizer for the given string that splits on a delimiter string.
 *
 * @param input  the string which is to be parsed.
 * @param delim  the field delimiter string, applied through {@code setDelimiterString}.
 */
public StrTokenizer(final String input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
365 
/**
 * Constructs a tokenizer for the given string that splits wherever the delimiter matcher matches.
 *
 * @param input  the string which is to be parsed.
 * @param delim  the field delimiter matcher, applied through {@code setDelimiterMatcher}.
 */
public StrTokenizer(final String input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
376 
/**
 * Constructs a tokenizer for the given string that splits wherever the delimiter matcher matches
 * and recognises quotes through the quote matcher.
 *
 * @param input  the string which is to be parsed.
 * @param delim  the field delimiter matcher.
 * @param quote  the field quoted string matcher, applied through {@code setQuoteMatcher}.
 */
public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
389 
390     /**
391      * Unsupported ListIterator operation.
392      * @param obj this parameter ignored.
393      * @throws UnsupportedOperationException always.
394      */
395     @Override
396     public void add(final String obj) {
397         throw new UnsupportedOperationException("add() is unsupported");
398     }
399 
400     /**
401      * Adds a token to a list, paying attention to the parameters we've set.
402      *
403      * @param list  the list to add to.
404      * @param tok  the token to add.
405      */
406     private void addToken(final List<String> list, String tok) {
407         if (tok == null || tok.isEmpty()) {
408             if (isIgnoreEmptyTokens()) {
409                 return;
410             }
411             if (isEmptyTokenAsNull()) {
412                 tok = null;
413             }
414         }
415         list.add(tok);
416     }
417 
418     /**
419      * Checks if tokenization has been done, and if not then do it.
420      */
421     private void checkTokenized() {
422         if (tokens == null) {
423             if (chars == null) {
424                 // still call tokenize as subclass may do some work
425                 final List<String> split = tokenize(null, 0, 0);
426                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
427             } else {
428                 final List<String> split = tokenize(chars, 0, chars.length);
429                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
430             }
431         }
432     }
433 
434     /**
435      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
436      * {@link CloneNotSupportedException} is caught, return {@code null}.
437      *
438      * @return a new instance of this Tokenizer which has been reset.
439      */
440     @Override
441     public Object clone() {
442         try {
443             return cloneReset();
444         } catch (final CloneNotSupportedException ex) {
445             return null;
446         }
447     }
448 
449     /**
450      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
451      *
452      * @return a new instance of this Tokenizer which has been reset.
453      * @throws CloneNotSupportedException if there is a problem cloning.
454      */
455     Object cloneReset() throws CloneNotSupportedException {
456         // this method exists to enable 100% test coverage
457         final StrTokenizer cloned = (StrTokenizer) super.clone();
458         if (cloned.chars != null) {
459             cloned.chars = cloned.chars.clone();
460         }
461         cloned.reset();
462         return cloned;
463     }
464 
465     /**
466      * Gets the String content that the tokenizer is parsing.
467      *
468      * @return The string content being parsed.
469      */
470     public String getContent() {
471         if (chars == null) {
472             return null;
473         }
474         return new String(chars);
475     }
476 
477     /**
478      * Gets the field delimiter matcher.
479      *
480      * @return The delimiter matcher in use.
481      */
482     public StrMatcher getDelimiterMatcher() {
483         return this.delimMatcher;
484     }
485 
486     /**
487      * Gets the ignored character matcher.
488      * <p>
489      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
490      * </p>
491      *
492      * @return The ignored matcher in use.
493      */
494     public StrMatcher getIgnoredMatcher() {
495         return ignoredMatcher;
496     }
497 
498     /**
499      * Gets the quote matcher currently in use.
500      * <p>
501      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
502      * </p>
503      *
504      * @return The quote matcher in use.
505      */
506     public StrMatcher getQuoteMatcher() {
507         return quoteMatcher;
508     }
509 
510     /**
511      * Gets a copy of the full token list as an independent modifiable array.
512      *
513      * @return The tokens as a String array.
514      */
515     public String[] getTokenArray() {
516         checkTokenized();
517         return tokens.clone();
518     }
519 
520     /**
521      * Gets a copy of the full token list as an independent modifiable list.
522      *
523      * @return The tokens as a String array.
524      */
525     public List<String> getTokenList() {
526         checkTokenized();
527         final List<String> list = new ArrayList<>(tokens.length);
528         Collections.addAll(list, tokens);
529 
530         return list;
531     }
532 
533     /**
534      * Gets the trimmer character matcher.
535      * <p>
536      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
537      * </p>
538      *
539      * @return The trimmer matcher in use.
540      */
541     public StrMatcher getTrimmerMatcher() {
542         return trimmerMatcher;
543     }
544 
545     /**
546      * Checks whether there are any more tokens.
547      *
548      * @return true if there are more tokens.
549      */
550     @Override
551     public boolean hasNext() {
552         checkTokenized();
553         return tokenPos < tokens.length;
554     }
555 
556     /**
557      * Checks whether there are any previous tokens that can be iterated to.
558      *
559      * @return true if there are previous tokens.
560      */
561     @Override
562     public boolean hasPrevious() {
563         checkTokenized();
564         return tokenPos > 0;
565     }
566 
567     /**
568      * Gets whether the tokenizer currently returns empty tokens as null.
569      * The default for this property is false.
570      *
571      * @return true if empty tokens are returned as null.
572      */
573     public boolean isEmptyTokenAsNull() {
574         return this.emptyAsNull;
575     }
576 
577     /**
578      * Gets whether the tokenizer currently ignores empty tokens.
579      * The default for this property is true.
580      *
581      * @return true if empty tokens are not returned.
582      */
583     public boolean isIgnoreEmptyTokens() {
584         return ignoreEmptyTokens;
585     }
586 
587     /**
588      * Checks if the characters at the index specified match the quote
589      * already matched in readNextToken().
590      *
591      * @param srcChars  the character array being tokenized.
592      * @param pos  the position to check for a quote.
593      * @param len  the length of the character array being tokenized.
594      * @param quoteStart  the start position of the matched quote, 0 if no quoting.
595      * @param quoteLen  the length of the matched quote, 0 if no quoting.
596      * @return true if a quote is matched.
597      */
598     private boolean isQuote(final char[] srcChars,
599                             final int pos,
600                             final int len,
601                             final int quoteStart,
602                             final int quoteLen) {
603         for (int i = 0; i < quoteLen; i++) {
604             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
605                 return false;
606             }
607         }
608         return true;
609     }
610 
611     /**
612      * Gets the next token.
613      *
614      * @return The next String token.
615      * @throws NoSuchElementException if there are no more elements.
616      */
617     @Override
618     public String next() {
619         if (hasNext()) {
620             return tokens[tokenPos++];
621         }
622         throw new NoSuchElementException();
623     }
624 
625     /**
626      * Gets the index of the next token to return.
627      *
628      * @return The next token index.
629      */
630     @Override
631     public int nextIndex() {
632         return tokenPos;
633     }
634 
635     /**
636      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
637      * tokens remain.
638      *
639      * @return The next sequential token, or null when no more tokens are found.
640      */
641     public String nextToken() {
642         if (hasNext()) {
643             return tokens[tokenPos++];
644         }
645         return null;
646     }
647 
648     /**
649      * Gets the token previous to the last returned token.
650      *
651      * @return The previous token.
652      */
653     @Override
654     public String previous() {
655         if (hasPrevious()) {
656             return tokens[--tokenPos];
657         }
658         throw new NoSuchElementException();
659     }
660 
661     /**
662      * Gets the index of the previous token.
663      *
664      * @return The previous token index.
665      */
666     @Override
667     public int previousIndex() {
668         return tokenPos - 1;
669     }
670 
671     /**
672      * Gets the previous token from the String.
673      *
674      * @return The previous sequential token, or null when no more tokens are found.
675      */
676     public String previousToken() {
677         if (hasPrevious()) {
678             return tokens[--tokenPos];
679         }
680         return null;
681     }
682 
683     /**
684      * Reads character by character through the String to get the next token.
685      *
686      * @param srcChars  the character array being tokenized.
687      * @param start     the first character of field.
688      * @param len       the length of the character array being tokenized.
689      * @param workArea  a temporary work area.
690      * @param tokenList the list of parsed tokens.
691      * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
692      */
693     private int readNextToken(final char[] srcChars,
694                               int start,
695                               final int len,
696                               final StrBuilder workArea,
697                               final List<String> tokenList) {
698         // skip all leading whitespace, unless it is the
699         // field delimiter or the quote character
700         while (start < len) {
701             final int removeLen = Math.max(
702                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
703                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
704             if (removeLen == 0
705                     || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
706                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
707                 break;
708             }
709             start += removeLen;
710         }
711 
712         // handle reaching end
713         if (start >= len) {
714             addToken(tokenList, StringUtils.EMPTY);
715             return -1;
716         }
717 
718         // handle empty token
719         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
720         if (delimLen > 0) {
721             addToken(tokenList, StringUtils.EMPTY);
722             return start + delimLen;
723         }
724 
725         // handle found token
726         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
727         if (quoteLen > 0) {
728             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
729         }
730         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
731     }
732 
733     /**
734      * Reads a possibly quoted string token.
735      *
736      * @param srcChars   the character array being tokenized.
737      * @param start      the first character of field.
738      * @param len        the length of the character array being tokenized.
739      * @param workArea   a temporary work area.
740      * @param tokenList  the list of parsed tokens.
741      * @param quoteStart the start position of the matched quote, 0 if no quoting.
742      * @param quoteLen   the length of the matched quote, 0 if no quoting.
743      * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
744      */
745     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
746                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
747         // Loop until we've found the end of the quoted
748         // string or the end of the input
749         workArea.clear();
750         int pos = start;
751         boolean quoting = quoteLen > 0;
752         int trimStart = 0;
753 
754         while (pos < len) {
755             // quoting mode can occur several times throughout a string
756             // we must switch between quoting and non-quoting until we
757             // encounter a non-quoted delimiter, or end of string
758             if (quoting) {
759                 // In quoting mode
760 
761                 // If we've found a quote character, see if it's
762                 // followed by a second quote.  If so, then we need
763                 // to actually put the quote character into the token
764                 // rather than end the token.
765                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
766                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
767                         // matched pair of quotes, thus an escaped quote
768                         workArea.append(srcChars, pos, quoteLen);
769                         pos += quoteLen * 2;
770                         trimStart = workArea.size();
771                         continue;
772                     }
773 
774                     // end of quoting
775                     quoting = false;
776                     pos += quoteLen;
777                     continue;
778                 }
779 
780             } else {
781                 // Not in quoting mode
782 
783                 // check for delimiter, and thus end of token
784                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
785                 if (delimLen > 0) {
786                     // return condition when end of token found
787                     addToken(tokenList, workArea.substring(0, trimStart));
788                     return pos + delimLen;
789                 }
790 
791                 // check for quote, and thus back into quoting mode
792                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
793                     quoting = true;
794                     pos += quoteLen;
795                     continue;
796                 }
797 
798                 // check for ignored (outside quotes), and ignore
799                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
800                 if (ignoredLen > 0) {
801                     pos += ignoredLen;
802                     continue;
803                 }
804 
805                 // check for trimmed character
806                 // don't yet know if its at the end, so copy to workArea
807                 // use trimStart to keep track of trim at the end
808                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
809                 if (trimmedLen > 0) {
810                     workArea.append(srcChars, pos, trimmedLen);
811                     pos += trimmedLen;
812                     continue;
813                 }
814 
815             }
816             // copy regular character from inside quotes
817             workArea.append(srcChars[pos++]);
818             trimStart = workArea.size();
819         }
820 
821         // return condition when end of string found
822         addToken(tokenList, workArea.substring(0, trimStart));
823         return -1;
824     }
825 
826     /**
827      * Unsupported ListIterator operation.
828      *
829      * @throws UnsupportedOperationException always.
830      */
831     @Override
832     public void remove() {
833         throw new UnsupportedOperationException("remove() is unsupported");
834     }
835 
836     /**
837      * Resets this tokenizer, forgetting all parsing and iteration already completed.
838      * <p>
839      * This method allows the same tokenizer to be reused for the same String.
840      * </p>
841      *
842      * @return {@code this} instance.
843      */
844     public StrTokenizer reset() {
845         tokenPos = 0;
846         tokens = null;
847         return this;
848     }
849 
850     /**
851      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
852      *
853      * @param input the new character array to tokenize, not cloned, null sets no text to parse.
854      * @return {@code this} instance.
855      */
856     public StrTokenizer reset(final char[] input) {
857         reset();
858         if (input != null) {
859             this.chars = input.clone();
860         } else {
861             this.chars = null;
862         }
863         return this;
864     }
865 
866     /**
867      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
868      *
869      * @param input the new string to tokenize, null sets no text to parse.
870      * @return {@code this} instance.
871      */
872     public StrTokenizer reset(final String input) {
873         reset();
874         if (input != null) {
875             this.chars = input.toCharArray();
876         } else {
877             this.chars = null;
878         }
879         return this;
880     }
881 
882     /**
883      * Unsupported ListIterator operation.
884      *
885      * @param obj this parameter ignored.
886      * @throws UnsupportedOperationException Always thrown.
887      */
888     @Override
889     public void set(final String obj) {
890         throw new UnsupportedOperationException("set() is unsupported");
891     }
892 
893     /**
894      * Sets the field delimiter character.
895      *
896      * @param delim  the delimiter character to use.
897      * @return {@code this} instance.
898      */
899     public StrTokenizer setDelimiterChar(final char delim) {
900         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
901     }
902 
903     /**
904      * Sets the field delimiter matcher.
905      * <p>
906      * The delimiter is used to separate one token from another.
907      * </p>
908      *
909      * @param delim  the delimiter matcher to use.
910      * @return {@code this} instance.
911      */
912     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
913         if (delim == null) {
914             this.delimMatcher = StrMatcher.noneMatcher();
915         } else {
916             this.delimMatcher = delim;
917         }
918         return this;
919     }
920 
921     /**
922      * Sets the field delimiter string.
923      *
924      * @param delim  the delimiter string to use.
925      * @return {@code this} instance.
926      */
927     public StrTokenizer setDelimiterString(final String delim) {
928         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
929     }
930 
931     /**
932      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
933      *
934      * @param emptyAsNull whether empty tokens are returned as null.
935      * @return {@code this} instance.
936      */
937     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
938         this.emptyAsNull = emptyAsNull;
939         return this;
940     }
941 
942     /**
943      * Sets the character to ignore.
944      * <p>
945      * This character is ignored when parsing the String, unless it is within a quoted region.
946      * </p>
947      *
948      * @param ignored the ignored character to use.
949      * @return {@code this} instance.
950      */
951     public StrTokenizer setIgnoredChar(final char ignored) {
952         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
953     }
954 
955     /**
956      * Sets the matcher for characters to ignore.
957      * <p>
958      * These characters are ignored when parsing the String, unless they are within a quoted region.
959      * </p>
960      *
961      * @param ignored the ignored matcher to use, null ignored.
962      * @return {@code this} instance.
963      */
964     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
965         if (ignored != null) {
966             this.ignoredMatcher = ignored;
967         }
968         return this;
969     }
970 
971     /**
972      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
973      *
974      * @param ignoreEmptyTokens whether empty tokens are not returned.
975      * @return {@code this} instance.
976      */
977     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
978         this.ignoreEmptyTokens = ignoreEmptyTokens;
979         return this;
980     }
981 
982     /**
983      * Sets the quote character to use.
984      * <p>
985      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
986      * </p>
987      *
988      * @param quote the quote character to use.
989      * @return {@code this} instance.
990      */
991     public StrTokenizer setQuoteChar(final char quote) {
992         return setQuoteMatcher(StrMatcher.charMatcher(quote));
993     }
994 
995     /**
996      * Sets the quote matcher to use.
997      * <p>
998      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
999      * </p>
1000      *
1001      * @param quote the quote matcher to use, null ignored.
1002      * @return {@code this} instance.
1003      */
1004     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1005         if (quote != null) {
1006             this.quoteMatcher = quote;
1007         }
1008         return this;
1009     }
1010 
1011     /**
1012      * Sets the matcher for characters to trim.
1013      * <p>
1014      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1015      * </p>
1016      *
1017      * @param trimmer the trimmer matcher to use, null ignored
1018      * @return {@code this} instance.
1019      */
1020     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1021         if (trimmer != null) {
1022             this.trimmerMatcher = trimmer;
1023         }
1024         return this;
1025     }
1026 
1027     /**
1028      * Gets the number of tokens found in the String.
1029      *
1030      * @return The number of matched tokens.
1031      */
1032     public int size() {
1033         checkTokenized();
1034         return tokens.length;
1035     }
1036 
1037     /**
1038      * Internal method to performs the tokenization.
1039      * <p>
1040      * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
1041      * </p>
1042      * <p>
1043      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1044      * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1045      * </p>
1046      * <p>
1047      * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1048      * values, or even an entirely different array.
1049      * </p>
1050      *
1051      * @param srcChars the character array being tokenized, may be null.
1052      * @param offset   the start position within the character array, must be valid.
1053      * @param count    the number of characters to tokenize, must be valid.
1054      * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1055      */
1056     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1057         if (srcChars == null || count == 0) {
1058             return Collections.emptyList();
1059         }
1060         final StrBuilder buf = new StrBuilder();
1061         final List<String> tokenList = new ArrayList<>();
1062         int pos = offset;
1063 
1064         // loop around the entire buffer
1065         while (pos >= 0 && pos < count) {
1066             // find next token
1067             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1068 
1069             // handle case where end of string is a delimiter
1070             if (pos >= count) {
1071                 addToken(tokenList, StringUtils.EMPTY);
1072             }
1073         }
1074         return tokenList;
1075     }
1076 
1077     /**
1078      * Gets the String content that the tokenizer is parsing.
1079      *
1080      * @return The string content being parsed.
1081      */
1082     @Override
1083     public String toString() {
1084         if (tokens == null) {
1085             return "StrTokenizer[not tokenized yet]";
1086         }
1087         return "StrTokenizer" + getTokenList();
1088     }
1089 
1090 }