View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.lang3.ArrayUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators)
30   * and supporting quoting and ignored character concepts.
31   * <p>
32   * This class can split a String into many smaller strings. It aims
33   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34   * however it offers much more control and flexibility including implementing
35   * the {@code ListIterator} interface. By default, it is set up
36   * like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>.
39   * Each token is separated from the next String by a <em>delimiter</em>.
40   * One or more delimiter characters must be specified.
41   * <p>
42   * Each token may be surrounded by quotes.
43   * The <em>quote</em> matcher specifies the quote character(s).
44   * A quote may be escaped within a quoted section by duplicating itself.
45   * <p>
46   * Between each token and the delimiter are potentially characters that need trimming.
47   * The <em>trimmer</em> matcher specifies these characters.
48   * One usage might be to trim whitespace characters.
49   * <p>
50   * At any point outside the quotes there might potentially be invalid characters.
51   * The <em>ignored</em> matcher specifies these characters to be removed.
52   * One usage might be to remove new line characters.
53   * <p>
54   * Empty tokens may be removed or returned as null.
55   * <pre>
56   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
57   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
58   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59   * </pre>
60   *
61   * <table>
62   *  <caption>StrTokenizer properties and options</caption>
63   *  <tr>
64   *   <th>Property</th><th>Type</th><th>Default</th>
65   *  </tr>
66   *  <tr>
67   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68   *  </tr>
69   *  <tr>
70   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
71   *  </tr>
72   *  <tr>
73   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74   *  </tr>
75   *  <tr>
76   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77   *  </tr>
78   *  <tr>
79   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80   *  </tr>
81   * </table>
82   *
83   * @since 1.0
84   * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85   */
86  @Deprecated
87  public class StrTokenizer implements ListIterator<String>, Cloneable {
88  
89      /** Comma separated values tokenizer internal variable. */
90      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
91  
92      /** Tab separated values tokenizer internal variable. */
93      private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
94  
95      static {
96          CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
97          CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
98          CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
99          CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
100         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
101         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
102         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
103 
104         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
105         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
106         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
107         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
108         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
109         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
110         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
111     }
112 
113     /**
114      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115      *
116      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
117      */
118     private static StrTokenizer getCSVClone() {
119         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
120     }
121 
122     /**
123      * Gets a new tokenizer instance which parses Comma Separated Value strings
124      * initializing it with the given input.  The default for CSV processing
125      * will be trim whitespace from both ends (which can be overridden with
126      * the setTrimmer method).
127      * <p>
128      * You must call a "reset" method to set the string which you want to parse.
129      * </p>
130      * @return a new tokenizer instance which parses Comma Separated Value strings
131      */
132     public static StrTokenizer getCSVInstance() {
133         return getCSVClone();
134     }
135 
136     /**
137      * Gets a new tokenizer instance which parses Comma Separated Value strings
138      * initializing it with the given input.  The default for CSV processing
139      * will be trim whitespace from both ends (which can be overridden with
140      * the setTrimmer method).
141      *
142      * @param input  the text to parse
143      * @return a new tokenizer instance which parses Comma Separated Value strings
144      */
145     public static StrTokenizer getCSVInstance(final char[] input) {
146         final StrTokenizer tok = getCSVClone();
147         tok.reset(input);
148         return tok;
149     }
150 
151     /**
152      * Gets a new tokenizer instance which parses Comma Separated Value strings
153      * initializing it with the given input.  The default for CSV processing
154      * will be trim whitespace from both ends (which can be overridden with
155      * the setTrimmer method).
156      *
157      * @param input  the text to parse
158      * @return a new tokenizer instance which parses Comma Separated Value strings
159      */
160     public static StrTokenizer getCSVInstance(final String input) {
161         final StrTokenizer tok = getCSVClone();
162         tok.reset(input);
163         return tok;
164     }
165     /**
166      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167      *
168      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
169      */
170     private static StrTokenizer getTSVClone() {
171         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
172     }
173 
174     /**
175      * Gets a new tokenizer instance which parses Tab Separated Value strings.
176      * The default for CSV processing will be trim whitespace from both ends
177      * (which can be overridden with the setTrimmer method).
178      * <p>
179      * You must call a "reset" method to set the string which you want to parse.
180      * </p>
181      * @return a new tokenizer instance which parses Tab Separated Value strings.
182      */
183     public static StrTokenizer getTSVInstance() {
184         return getTSVClone();
185     }
186 
187     /**
188      * Gets a new tokenizer instance which parses Tab Separated Value strings.
189      * The default for CSV processing will be trim whitespace from both ends
190      * (which can be overridden with the setTrimmer method).
191      * @param input  the string to parse
192      * @return a new tokenizer instance which parses Tab Separated Value strings.
193      */
194     public static StrTokenizer getTSVInstance(final char[] input) {
195         final StrTokenizer tok = getTSVClone();
196         tok.reset(input);
197         return tok;
198     }
199 
200     /**
201      * Gets a new tokenizer instance which parses Tab Separated Value strings.
202      * The default for CSV processing will be trim whitespace from both ends
203      * (which can be overridden with the setTrimmer method).
204      * @param input  the string to parse
205      * @return a new tokenizer instance which parses Tab Separated Value strings.
206      */
207     public static StrTokenizer getTSVInstance(final String input) {
208         final StrTokenizer tok = getTSVClone();
209         tok.reset(input);
210         return tok;
211     }
212 
    /** The characters to tokenize; {@code null} when no input has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run and again after {@code reset()}. */
    private String[] tokens;

    /** The current iteration position: the index into {@code tokens} that {@code next()} returns. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
239 
240     /**
241      * Constructs a tokenizer splitting on space, tab, newline and form feed
242      * as per StringTokenizer, but with no text to tokenize.
243      * <p>
244      * This constructor is normally used with {@link #reset(String)}.
245      * </p>
246      */
247     public StrTokenizer() {
248         this.chars = null;
249     }
250 
251     /**
252      * Constructs a tokenizer splitting on space, tab, newline and form feed
253      * as per StringTokenizer.
254      *
255      * @param input  the string which is to be parsed, not cloned
256      */
257     public StrTokenizer(final char[] input) {
258         if (input == null) {
259             this.chars = null;
260         } else {
261             this.chars = input.clone();
262         }
263     }
264 
265     /**
266      * Constructs a tokenizer splitting on the specified character.
267      *
268      * @param input  the string which is to be parsed, not cloned
269      * @param delim the field delimiter character
270      */
271     public StrTokenizer(final char[] input, final char delim) {
272         this(input);
273         setDelimiterChar(delim);
274     }
275 
276     /**
277      * Constructs a tokenizer splitting on the specified delimiter character
278      * and handling quotes using the specified quote character.
279      *
280      * @param input  the string which is to be parsed, not cloned
281      * @param delim  the field delimiter character
282      * @param quote  the field quoted string character
283      */
284     public StrTokenizer(final char[] input, final char delim, final char quote) {
285         this(input, delim);
286         setQuoteChar(quote);
287     }
288 
289     /**
290      * Constructs a tokenizer splitting on the specified string.
291      *
292      * @param input  the string which is to be parsed, not cloned
293      * @param delim the field delimiter string
294      */
295     public StrTokenizer(final char[] input, final String delim) {
296         this(input);
297         setDelimiterString(delim);
298     }
299 
300     /**
301      * Constructs a tokenizer splitting using the specified delimiter matcher.
302      *
303      * @param input  the string which is to be parsed, not cloned
304      * @param delim  the field delimiter matcher
305      */
306     public StrTokenizer(final char[] input, final StrMatcher delim) {
307         this(input);
308         setDelimiterMatcher(delim);
309     }
310 
311     /**
312      * Constructs a tokenizer splitting using the specified delimiter matcher
313      * and handling quotes using the specified quote matcher.
314      *
315      * @param input  the string which is to be parsed, not cloned
316      * @param delim  the field delimiter character
317      * @param quote  the field quoted string character
318      */
319     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
320         this(input, delim);
321         setQuoteMatcher(quote);
322     }
323 
324     /**
325      * Constructs a tokenizer splitting on space, tab, newline and form feed
326      * as per StringTokenizer.
327      *
328      * @param input  the string which is to be parsed
329      */
330     public StrTokenizer(final String input) {
331         if (input != null) {
332             chars = input.toCharArray();
333         } else {
334             chars = null;
335         }
336     }
337 
338     /**
339      * Constructs a tokenizer splitting on the specified delimiter character.
340      *
341      * @param input  the string which is to be parsed
342      * @param delim  the field delimiter character
343      */
344     public StrTokenizer(final String input, final char delim) {
345         this(input);
346         setDelimiterChar(delim);
347     }
348 
349     /**
350      * Constructs a tokenizer splitting on the specified delimiter character
351      * and handling quotes using the specified quote character.
352      *
353      * @param input  the string which is to be parsed
354      * @param delim  the field delimiter character
355      * @param quote  the field quoted string character
356      */
357     public StrTokenizer(final String input, final char delim, final char quote) {
358         this(input, delim);
359         setQuoteChar(quote);
360     }
361 
362     /**
363      * Constructs a tokenizer splitting on the specified delimiter string.
364      *
365      * @param input  the string which is to be parsed
366      * @param delim  the field delimiter string
367      */
368     public StrTokenizer(final String input, final String delim) {
369         this(input);
370         setDelimiterString(delim);
371     }
372 
373     /**
374      * Constructs a tokenizer splitting using the specified delimiter matcher.
375      *
376      * @param input  the string which is to be parsed
377      * @param delim  the field delimiter matcher
378      */
379     public StrTokenizer(final String input, final StrMatcher delim) {
380         this(input);
381         setDelimiterMatcher(delim);
382     }
383 
384     /**
385      * Constructs a tokenizer splitting using the specified delimiter matcher
386      * and handling quotes using the specified quote matcher.
387      *
388      * @param input  the string which is to be parsed
389      * @param delim  the field delimiter matcher
390      * @param quote  the field quoted string matcher
391      */
392     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
393         this(input, delim);
394         setQuoteMatcher(quote);
395     }
396 
397     /**
398      * Unsupported ListIterator operation.
399      * @param obj this parameter ignored.
400      * @throws UnsupportedOperationException always
401      */
402     @Override
403     public void add(final String obj) {
404         throw new UnsupportedOperationException("add() is unsupported");
405     }
406 
407     /**
408      * Adds a token to a list, paying attention to the parameters we've set.
409      *
410      * @param list  the list to add to
411      * @param tok  the token to add
412      */
413     private void addToken(final List<String> list, String tok) {
414         if (tok == null || tok.isEmpty()) {
415             if (isIgnoreEmptyTokens()) {
416                 return;
417             }
418             if (isEmptyTokenAsNull()) {
419                 tok = null;
420             }
421         }
422         list.add(tok);
423     }
424 
425     /**
426      * Checks if tokenization has been done, and if not then do it.
427      */
428     private void checkTokenized() {
429         if (tokens == null) {
430             if (chars == null) {
431                 // still call tokenize as subclass may do some work
432                 final List<String> split = tokenize(null, 0, 0);
433                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
434             } else {
435                 final List<String> split = tokenize(chars, 0, chars.length);
436                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
437             }
438         }
439     }
440 
441     /**
442      * Creates a new instance of this Tokenizer. The new instance is reset so
443      * that it will be at the start of the token list.
444      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
445      *
446      * @return a new instance of this Tokenizer which has been reset.
447      */
448     @Override
449     public Object clone() {
450         try {
451             return cloneReset();
452         } catch (final CloneNotSupportedException ex) {
453             return null;
454         }
455     }
456 
457     /**
458      * Creates a new instance of this Tokenizer. The new instance is reset so that
459      * it will be at the start of the token list.
460      *
461      * @return a new instance of this Tokenizer which has been reset.
462      * @throws CloneNotSupportedException if there is a problem cloning
463      */
464     Object cloneReset() throws CloneNotSupportedException {
465         // this method exists to enable 100% test coverage
466         final StrTokenizer cloned = (StrTokenizer) super.clone();
467         if (cloned.chars != null) {
468             cloned.chars = cloned.chars.clone();
469         }
470         cloned.reset();
471         return cloned;
472     }
473 
474     /**
475      * Gets the String content that the tokenizer is parsing.
476      *
477      * @return The string content being parsed
478      */
479     public String getContent() {
480         if (chars == null) {
481             return null;
482         }
483         return new String(chars);
484     }
485 
486     /**
487      * Gets the field delimiter matcher.
488      *
489      * @return The delimiter matcher in use
490      */
491     public StrMatcher getDelimiterMatcher() {
492         return this.delimMatcher;
493     }
494 
495     /**
496      * Gets the ignored character matcher.
497      * <p>
498      * These characters are ignored when parsing the String, unless they are
499      * within a quoted region.
500      * The default value is not to ignore anything.
501      * </p>
502      *
503      * @return The ignored matcher in use
504      */
505     public StrMatcher getIgnoredMatcher() {
506         return ignoredMatcher;
507     }
508 
509     /**
510      * Gets the quote matcher currently in use.
511      * <p>
512      * The quote character is used to wrap data between the tokens.
513      * This enables delimiters to be entered as data.
514      * The default value is '"' (double quote).
515      * </p>
516      *
517      * @return The quote matcher in use
518      */
519     public StrMatcher getQuoteMatcher() {
520         return quoteMatcher;
521     }
522 
523     /**
524      * Gets a copy of the full token list as an independent modifiable array.
525      *
526      * @return The tokens as a String array
527      */
528     public String[] getTokenArray() {
529         checkTokenized();
530         return tokens.clone();
531     }
532 
533     /**
534      * Gets a copy of the full token list as an independent modifiable list.
535      *
536      * @return The tokens as a String array
537      */
538     public List<String> getTokenList() {
539         checkTokenized();
540         final List<String> list = new ArrayList<>(tokens.length);
541         Collections.addAll(list, tokens);
542 
543         return list;
544     }
545 
546     /**
547      * Gets the trimmer character matcher.
548      * <p>
549      * These characters are trimmed off on each side of the delimiter
550      * until the token or quote is found.
551      * The default value is not to trim anything.
552      * </p>
553      *
554      * @return The trimmer matcher in use
555      */
556     public StrMatcher getTrimmerMatcher() {
557         return trimmerMatcher;
558     }
559 
560     /**
561      * Checks whether there are any more tokens.
562      *
563      * @return true if there are more tokens
564      */
565     @Override
566     public boolean hasNext() {
567         checkTokenized();
568         return tokenPos < tokens.length;
569     }
570 
571     /**
572      * Checks whether there are any previous tokens that can be iterated to.
573      *
574      * @return true if there are previous tokens
575      */
576     @Override
577     public boolean hasPrevious() {
578         checkTokenized();
579         return tokenPos > 0;
580     }
581 
582     /**
583      * Gets whether the tokenizer currently returns empty tokens as null.
584      * The default for this property is false.
585      *
586      * @return true if empty tokens are returned as null
587      */
588     public boolean isEmptyTokenAsNull() {
589         return this.emptyAsNull;
590     }
591 
592     /**
593      * Gets whether the tokenizer currently ignores empty tokens.
594      * The default for this property is true.
595      *
596      * @return true if empty tokens are not returned
597      */
598     public boolean isIgnoreEmptyTokens() {
599         return ignoreEmptyTokens;
600     }
601 
602     /**
603      * Checks if the characters at the index specified match the quote
604      * already matched in readNextToken().
605      *
606      * @param srcChars  the character array being tokenized
607      * @param pos  the position to check for a quote
608      * @param len  the length of the character array being tokenized
609      * @param quoteStart  the start position of the matched quote, 0 if no quoting
610      * @param quoteLen  the length of the matched quote, 0 if no quoting
611      * @return true if a quote is matched
612      */
613     private boolean isQuote(final char[] srcChars,
614                             final int pos,
615                             final int len,
616                             final int quoteStart,
617                             final int quoteLen) {
618         for (int i = 0; i < quoteLen; i++) {
619             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
620                 return false;
621             }
622         }
623         return true;
624     }
625 
626     /**
627      * Gets the next token.
628      *
629      * @return The next String token
630      * @throws NoSuchElementException if there are no more elements
631      */
632     @Override
633     public String next() {
634         if (hasNext()) {
635             return tokens[tokenPos++];
636         }
637         throw new NoSuchElementException();
638     }
639 
640     /**
641      * Gets the index of the next token to return.
642      *
643      * @return The next token index
644      */
645     @Override
646     public int nextIndex() {
647         return tokenPos;
648     }
649 
650     /**
651      * Gets the next token from the String.
652      * Equivalent to {@link #next()} except it returns null rather than
653      * throwing {@link NoSuchElementException} when no tokens remain.
654      *
655      * @return The next sequential token, or null when no more tokens are found
656      */
657     public String nextToken() {
658         if (hasNext()) {
659             return tokens[tokenPos++];
660         }
661         return null;
662     }
663 
664     /**
665      * Gets the token previous to the last returned token.
666      *
667      * @return The previous token
668      */
669     @Override
670     public String previous() {
671         if (hasPrevious()) {
672             return tokens[--tokenPos];
673         }
674         throw new NoSuchElementException();
675     }
676 
677     /**
678      * Gets the index of the previous token.
679      *
680      * @return The previous token index
681      */
682     @Override
683     public int previousIndex() {
684         return tokenPos - 1;
685     }
686 
687     /**
688      * Gets the previous token from the String.
689      *
690      * @return The previous sequential token, or null when no more tokens are found
691      */
692     public String previousToken() {
693         if (hasPrevious()) {
694             return tokens[--tokenPos];
695         }
696         return null;
697     }
698 
699     /**
700      * Reads character by character through the String to get the next token.
701      *
702      * @param srcChars  the character array being tokenized
703      * @param start  the first character of field
704      * @param len  the length of the character array being tokenized
705      * @param workArea  a temporary work area
706      * @param tokenList  the list of parsed tokens
707      * @return The starting position of the next field (the character
708      *  immediately after the delimiter), or -1 if end of string found
709      */
710     private int readNextToken(final char[] srcChars,
711                               int start,
712                               final int len,
713                               final StrBuilder workArea,
714                               final List<String> tokenList) {
715         // skip all leading whitespace, unless it is the
716         // field delimiter or the quote character
717         while (start < len) {
718             final int removeLen = Math.max(
719                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
720                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
721             if (removeLen == 0
722                     || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
723                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
724                 break;
725             }
726             start += removeLen;
727         }
728 
729         // handle reaching end
730         if (start >= len) {
731             addToken(tokenList, StringUtils.EMPTY);
732             return -1;
733         }
734 
735         // handle empty token
736         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
737         if (delimLen > 0) {
738             addToken(tokenList, StringUtils.EMPTY);
739             return start + delimLen;
740         }
741 
742         // handle found token
743         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
744         if (quoteLen > 0) {
745             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
746         }
747         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
748     }
749 
750     /**
751      * Reads a possibly quoted string token.
752      *
753      * @param srcChars  the character array being tokenized
754      * @param start  the first character of field
755      * @param len  the length of the character array being tokenized
756      * @param workArea  a temporary work area
757      * @param tokenList  the list of parsed tokens
758      * @param quoteStart  the start position of the matched quote, 0 if no quoting
759      * @param quoteLen  the length of the matched quote, 0 if no quoting
760      * @return The starting position of the next field (the character
761      *  immediately after the delimiter, or if end of string found,
762      *  then the length of string
763      */
764     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
765                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
766         // Loop until we've found the end of the quoted
767         // string or the end of the input
768         workArea.clear();
769         int pos = start;
770         boolean quoting = quoteLen > 0;
771         int trimStart = 0;
772 
773         while (pos < len) {
774             // quoting mode can occur several times throughout a string
775             // we must switch between quoting and non-quoting until we
776             // encounter a non-quoted delimiter, or end of string
777             if (quoting) {
778                 // In quoting mode
779 
780                 // If we've found a quote character, see if it's
781                 // followed by a second quote.  If so, then we need
782                 // to actually put the quote character into the token
783                 // rather than end the token.
784                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
785                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
786                         // matched pair of quotes, thus an escaped quote
787                         workArea.append(srcChars, pos, quoteLen);
788                         pos += quoteLen * 2;
789                         trimStart = workArea.size();
790                         continue;
791                     }
792 
793                     // end of quoting
794                     quoting = false;
795                     pos += quoteLen;
796                     continue;
797                 }
798 
799             } else {
800                 // Not in quoting mode
801 
802                 // check for delimiter, and thus end of token
803                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
804                 if (delimLen > 0) {
805                     // return condition when end of token found
806                     addToken(tokenList, workArea.substring(0, trimStart));
807                     return pos + delimLen;
808                 }
809 
810                 // check for quote, and thus back into quoting mode
811                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
812                     quoting = true;
813                     pos += quoteLen;
814                     continue;
815                 }
816 
817                 // check for ignored (outside quotes), and ignore
818                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
819                 if (ignoredLen > 0) {
820                     pos += ignoredLen;
821                     continue;
822                 }
823 
824                 // check for trimmed character
825                 // don't yet know if its at the end, so copy to workArea
826                 // use trimStart to keep track of trim at the end
827                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
828                 if (trimmedLen > 0) {
829                     workArea.append(srcChars, pos, trimmedLen);
830                     pos += trimmedLen;
831                     continue;
832                 }
833 
834             }
835             // copy regular character from inside quotes
836             workArea.append(srcChars[pos++]);
837             trimStart = workArea.size();
838         }
839 
840         // return condition when end of string found
841         addToken(tokenList, workArea.substring(0, trimStart));
842         return -1;
843     }
844 
845     /**
846      * Unsupported ListIterator operation.
847      *
848      * @throws UnsupportedOperationException always
849      */
850     @Override
851     public void remove() {
852         throw new UnsupportedOperationException("remove() is unsupported");
853     }
854 
855     /**
856      * Resets this tokenizer, forgetting all parsing and iteration already completed.
857      * <p>
858      * This method allows the same tokenizer to be reused for the same String.
859      *
860      * @return this, to enable chaining
861      */
862     public StrTokenizer reset() {
863         tokenPos = 0;
864         tokens = null;
865         return this;
866     }
867 
868     /**
869      * Reset this tokenizer, giving it a new input string to parse.
870      * In this manner you can re-use a tokenizer with the same settings
871      * on multiple input lines.
872      *
873      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
874      * @return this, to enable chaining
875      */
876     public StrTokenizer reset(final char[] input) {
877         reset();
878         if (input != null) {
879             this.chars = input.clone();
880         } else {
881             this.chars = null;
882         }
883         return this;
884     }
885 
886     /**
887      * Reset this tokenizer, giving it a new input string to parse.
888      * In this manner you can re-use a tokenizer with the same settings
889      * on multiple input lines.
890      *
891      * @param input  the new string to tokenize, null sets no text to parse
892      * @return this, to enable chaining
893      */
894     public StrTokenizer reset(final String input) {
895         reset();
896         if (input != null) {
897             this.chars = input.toCharArray();
898         } else {
899             this.chars = null;
900         }
901         return this;
902     }
903 
904     /**
905      * Unsupported ListIterator operation.
906      * @param obj this parameter ignored.
907      * @throws UnsupportedOperationException always
908      */
909     @Override
910     public void set(final String obj) {
911         throw new UnsupportedOperationException("set() is unsupported");
912     }
913 
914     /**
915      * Sets the field delimiter character.
916      *
917      * @param delim  the delimiter character to use
918      * @return this, to enable chaining
919      */
920     public StrTokenizer setDelimiterChar(final char delim) {
921         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
922     }
923 
924     /**
925      * Sets the field delimiter matcher.
926      * <p>
927      * The delimiter is used to separate one token from another.
928      * </p>
929      *
930      * @param delim  the delimiter matcher to use
931      * @return this, to enable chaining
932      */
933     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
934         if (delim == null) {
935             this.delimMatcher = StrMatcher.noneMatcher();
936         } else {
937             this.delimMatcher = delim;
938         }
939         return this;
940     }
941 
942     /**
943      * Sets the field delimiter string.
944      *
945      * @param delim  the delimiter string to use
946      * @return this, to enable chaining
947      */
948     public StrTokenizer setDelimiterString(final String delim) {
949         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
950     }
951 
952     /**
953      * Sets whether the tokenizer should return empty tokens as null.
954      * The default for this property is false.
955      *
956      * @param emptyAsNull  whether empty tokens are returned as null
957      * @return this, to enable chaining
958      */
959     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
960         this.emptyAsNull = emptyAsNull;
961         return this;
962     }
963 
964     /**
965      * Sets the character to ignore.
966      * <p>
967      * This character is ignored when parsing the String, unless it is
968      * within a quoted region.
969      * </p>
970      *
971      * @param ignored  the ignored character to use
972      * @return this, to enable chaining
973      */
974     public StrTokenizer setIgnoredChar(final char ignored) {
975         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
976     }
977 
978     /**
979      * Sets the matcher for characters to ignore.
980      * <p>
981      * These characters are ignored when parsing the String, unless they are
982      * within a quoted region.
983      * </p>
984      *
985      * @param ignored  the ignored matcher to use, null ignored
986      * @return this, to enable chaining
987      */
988     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
989         if (ignored != null) {
990             this.ignoredMatcher = ignored;
991         }
992         return this;
993     }
994 
995     /**
996      * Sets whether the tokenizer should ignore and not return empty tokens.
997      * The default for this property is true.
998      *
999      * @param ignoreEmptyTokens  whether empty tokens are not returned
1000      * @return this, to enable chaining
1001      */
1002     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1003         this.ignoreEmptyTokens = ignoreEmptyTokens;
1004         return this;
1005     }
1006 
1007     /**
1008      * Sets the quote character to use.
1009      * <p>
1010      * The quote character is used to wrap data between the tokens.
1011      * This enables delimiters to be entered as data.
1012      * </p>
1013      *
1014      * @param quote  the quote character to use
1015      * @return this, to enable chaining
1016      */
1017     public StrTokenizer setQuoteChar(final char quote) {
1018         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1019     }
1020 
1021     /**
1022      * Sets the quote matcher to use.
1023      * <p>
1024      * The quote character is used to wrap data between the tokens.
1025      * This enables delimiters to be entered as data.
1026      * </p>
1027      *
1028      * @param quote  the quote matcher to use, null ignored
1029      * @return this, to enable chaining
1030      */
1031     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1032         if (quote != null) {
1033             this.quoteMatcher = quote;
1034         }
1035         return this;
1036     }
1037 
1038     /**
1039      * Sets the matcher for characters to trim.
1040      * <p>
1041      * These characters are trimmed off on each side of the delimiter
1042      * until the token or quote is found.
1043      * </p>
1044      *
1045      * @param trimmer  the trimmer matcher to use, null ignored
1046      * @return this, to enable chaining
1047      */
1048     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1049         if (trimmer != null) {
1050             this.trimmerMatcher = trimmer;
1051         }
1052         return this;
1053     }
1054 
1055     /**
1056      * Gets the number of tokens found in the String.
1057      *
1058      * @return The number of matched tokens
1059      */
1060     public int size() {
1061         checkTokenized();
1062         return tokens.length;
1063     }
1064 
1065     /**
1066      * Internal method to performs the tokenization.
1067      * <p>
1068      * Most users of this class do not need to call this method. This method
1069      * will be called automatically by other (public) methods when required.
1070      * </p>
1071      * <p>
1072      * This method exists to allow subclasses to add code before or after the
1073      * tokenization. For example, a subclass could alter the character array,
1074      * offset or count to be parsed, or call the tokenizer multiple times on
1075      * multiple strings. It is also be possible to filter the results.
1076      * </p>
1077      * <p>
1078      * {@code StrTokenizer} will always pass a zero offset and a count
1079      * equal to the length of the array to this method, however a subclass
1080      * may pass other values, or even an entirely different array.
1081      * </p>
1082      *
1083      * @param srcChars  the character array being tokenized, may be null
1084      * @param offset  the start position within the character array, must be valid
1085      * @param count  the number of characters to tokenize, must be valid
1086      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1087      */
1088     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1089         if (srcChars == null || count == 0) {
1090             return Collections.emptyList();
1091         }
1092         final StrBuilder buf = new StrBuilder();
1093         final List<String> tokenList = new ArrayList<>();
1094         int pos = offset;
1095 
1096         // loop around the entire buffer
1097         while (pos >= 0 && pos < count) {
1098             // find next token
1099             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1100 
1101             // handle case where end of string is a delimiter
1102             if (pos >= count) {
1103                 addToken(tokenList, StringUtils.EMPTY);
1104             }
1105         }
1106         return tokenList;
1107     }
1108 
1109     /**
1110      * Gets the String content that the tokenizer is parsing.
1111      *
1112      * @return The string content being parsed
1113      */
1114     @Override
1115     public String toString() {
1116         if (tokens == null) {
1117             return "StrTokenizer[not tokenized yet]";
1118         }
1119         return "StrTokenizer" + getTokenList();
1120     }
1121 
1122 }