Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025import java.util.StringTokenizer;
026
027import org.apache.commons.lang3.ArrayUtils;
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Tokenizes a string based on delimiters (separators)
032 * and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims
035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
036 * however it offers much more control and flexibility including implementing
037 * the {@link ListIterator} interface. By default, it is set up
038 * like {@link StringTokenizer}.
039 * </p>
040 * <p>
041 * The input String is split into a number of <em>tokens</em>.
042 * Each token is separated from the next String by a <em>delimiter</em>.
043 * One or more delimiter characters must be specified.
044 * </p>
045 * <p>
046 * Each token may be surrounded by quotes.
047 * The <em>quote</em> matcher specifies the quote character(s).
048 * A quote may be escaped within a quoted section by duplicating itself.
049 * </p>
050 * <p>
051 * Between each token and the delimiter are potentially characters that need trimming.
052 * The <em>trimmer</em> matcher specifies these characters.
053 * One usage might be to trim whitespace characters.
054 * </p>
055 * <p>
056 * At any point outside the quotes there might potentially be invalid characters.
057 * The <em>ignored</em> matcher specifies these characters to be removed.
058 * One usage might be to remove new line characters.
059 * </p>
060 * <p>
061 * Empty tokens may be removed or returned as null.
062 * </p>
063 * <pre>
064 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
065 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
067 * </pre>
068 *
069 * <table>
070 *  <caption>StrTokenizer properties and options</caption>
071 *  <tr>
072 *   <th>Property</th><th>Type</th><th>Default</th>
073 *  </tr>
074 *  <tr>
075 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
076 *  </tr>
077 *  <tr>
078 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
079 *  </tr>
080 *  <tr>
081 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
082 *  </tr>
083 *  <tr>
084 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
085 *  </tr>
086 *  <tr>
087 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
088 *  </tr>
089 * </table>
090 *
091 * @since 2.2
092 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
094 * StringTokenizer</a>.
095 */
096@Deprecated
097public class StrTokenizer implements ListIterator<String>, Cloneable {
098
099    // @formatter:off
100    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
101            .setDelimiterMatcher(StrMatcher.commaMatcher())
102            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
103            .setIgnoredMatcher(StrMatcher.noneMatcher())
104            .setTrimmerMatcher(StrMatcher.trimMatcher())
105            .setEmptyTokenAsNull(false)
106            .setIgnoreEmptyTokens(false);
107    // @formatter:on
108
109    // @formatter:off
110    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
111            .setDelimiterMatcher(StrMatcher.tabMatcher())
112            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
113            .setIgnoredMatcher(StrMatcher.noneMatcher())
114            .setTrimmerMatcher(StrMatcher.trimMatcher())
115            .setEmptyTokenAsNull(false)
116            .setIgnoreEmptyTokens(false);
117    // @formatter:on
118
119    /**
120     * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121     *
122     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123     */
124    private static StrTokenizer getCSVClone() {
125        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126    }
127    /**
128     * Gets a new tokenizer instance which parses Comma Separated Value strings
129     * initializing it with the given input.  The default for CSV processing
130     * will be trim whitespace from both ends (which can be overridden with
131     * the setTrimmer method).
132     * <p>
133     * You must call a "reset" method to set the string which you want to parse.
134     * </p>
135     * @return a new tokenizer instance which parses Comma Separated Value strings
136     */
137    public static StrTokenizer getCSVInstance() {
138        return getCSVClone();
139    }
140    /**
141     * Gets a new tokenizer instance which parses Comma Separated Value strings
142     * initializing it with the given input.  The default for CSV processing
143     * will be trim whitespace from both ends (which can be overridden with
144     * the setTrimmer method).
145     *
146     * @param input  the text to parse
147     * @return a new tokenizer instance which parses Comma Separated Value strings
148     */
149    public static StrTokenizer getCSVInstance(final char[] input) {
150        final StrTokenizer tok = getCSVClone();
151        tok.reset(input);
152        return tok;
153    }
154
155    /**
156     * Gets a new tokenizer instance which parses Comma Separated Value strings
157     * initializing it with the given input.  The default for CSV processing
158     * will be trim whitespace from both ends (which can be overridden with
159     * the setTrimmer method).
160     *
161     * @param input  the text to parse
162     * @return a new tokenizer instance which parses Comma Separated Value strings
163     */
164    public static StrTokenizer getCSVInstance(final String input) {
165        final StrTokenizer tok = getCSVClone();
166        tok.reset(input);
167        return tok;
168    }
169    /**
170     * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171     *
172     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173     */
174    private static StrTokenizer getTSVClone() {
175        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176    }
177
178    /**
179     * Gets a new tokenizer instance which parses Tab Separated Value strings.
180     * The default for CSV processing will be trim whitespace from both ends
181     * (which can be overridden with the setTrimmer method).
182     * <p>
183     * You must call a "reset" method to set the string which you want to parse.
184     * </p>
185     * @return a new tokenizer instance which parses Tab Separated Value strings.
186     */
187    public static StrTokenizer getTSVInstance() {
188        return getTSVClone();
189    }
190
191    /**
192     * Gets a new tokenizer instance which parses Tab Separated Value strings.
193     * The default for CSV processing will be trim whitespace from both ends
194     * (which can be overridden with the setTrimmer method).
195     * @param input  the string to parse
196     * @return a new tokenizer instance which parses Tab Separated Value strings.
197     */
198    public static StrTokenizer getTSVInstance(final char[] input) {
199        final StrTokenizer tok = getTSVClone();
200        tok.reset(input);
201        return tok;
202    }
203
204    /**
205     * Gets a new tokenizer instance which parses Tab Separated Value strings.
206     * The default for CSV processing will be trim whitespace from both ends
207     * (which can be overridden with the setTrimmer method).
208     * @param input  the string to parse
209     * @return a new tokenizer instance which parses Tab Separated Value strings.
210     */
211    public static StrTokenizer getTSVInstance(final String input) {
212        final StrTokenizer tok = getTSVClone();
213        tok.reset(input);
214        return tok;
215    }
216    /** The text to work on. */
217    private char[] chars;
218
219    /** The parsed tokens */
220    private String[] tokens;
221
222    /** The current iteration position */
223    private int tokenPos;
224
225    /** The delimiter matcher */
226    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
227
228    /** The quote matcher */
229    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
230
231    /** The ignored matcher */
232    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
233
234    /** The trimmer matcher */
235    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
236
237    /** Whether to return empty tokens as null */
238    private boolean emptyAsNull;
239
240    /** Whether to ignore empty tokens */
241    private boolean ignoreEmptyTokens = true;
242
243    /**
244     * Constructs a tokenizer splitting on space, tab, newline and formfeed
245     * as per StringTokenizer, but with no text to tokenize.
246     * <p>
247     * This constructor is normally used with {@link #reset(String)}.
248     * </p>
249     */
250    public StrTokenizer() {
251        this.chars = null;
252    }
253
254    /**
255     * Constructs a tokenizer splitting on space, tab, newline and formfeed
256     * as per StringTokenizer.
257     *
258     * @param input  the string which is to be parsed, not cloned
259     */
260    public StrTokenizer(final char[] input) {
261        this.chars = ArrayUtils.clone(input);
262    }
263
264    /**
265     * Constructs a tokenizer splitting on the specified character.
266     *
267     * @param input  the string which is to be parsed, not cloned
268     * @param delim the field delimiter character
269     */
270    public StrTokenizer(final char[] input, final char delim) {
271        this(input);
272        setDelimiterChar(delim);
273    }
274
275    /**
276     * Constructs a tokenizer splitting on the specified delimiter character
277     * and handling quotes using the specified quote character.
278     *
279     * @param input  the string which is to be parsed, not cloned
280     * @param delim  the field delimiter character
281     * @param quote  the field quoted string character
282     */
283    public StrTokenizer(final char[] input, final char delim, final char quote) {
284        this(input, delim);
285        setQuoteChar(quote);
286    }
287
288    /**
289     * Constructs a tokenizer splitting on the specified string.
290     *
291     * @param input  the string which is to be parsed, not cloned
292     * @param delim the field delimiter string
293     */
294    public StrTokenizer(final char[] input, final String delim) {
295        this(input);
296        setDelimiterString(delim);
297    }
298
299    /**
300     * Constructs a tokenizer splitting using the specified delimiter matcher.
301     *
302     * @param input  the string which is to be parsed, not cloned
303     * @param delim  the field delimiter matcher
304     */
305    public StrTokenizer(final char[] input, final StrMatcher delim) {
306        this(input);
307        setDelimiterMatcher(delim);
308    }
309
310    /**
311     * Constructs a tokenizer splitting using the specified delimiter matcher
312     * and handling quotes using the specified quote matcher.
313     *
314     * @param input  the string which is to be parsed, not cloned
315     * @param delim  the field delimiter character
316     * @param quote  the field quoted string character
317     */
318    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
319        this(input, delim);
320        setQuoteMatcher(quote);
321    }
322
323    /**
324     * Constructs a tokenizer splitting on space, tab, newline and formfeed
325     * as per StringTokenizer.
326     *
327     * @param input  the string which is to be parsed
328     */
329    public StrTokenizer(final String input) {
330        if (input != null) {
331            chars = input.toCharArray();
332        } else {
333            chars = null;
334        }
335    }
336
337    /**
338     * Constructs a tokenizer splitting on the specified delimiter character.
339     *
340     * @param input  the string which is to be parsed
341     * @param delim  the field delimiter character
342     */
343    public StrTokenizer(final String input, final char delim) {
344        this(input);
345        setDelimiterChar(delim);
346    }
347
348    /**
349     * Constructs a tokenizer splitting on the specified delimiter character
350     * and handling quotes using the specified quote character.
351     *
352     * @param input  the string which is to be parsed
353     * @param delim  the field delimiter character
354     * @param quote  the field quoted string character
355     */
356    public StrTokenizer(final String input, final char delim, final char quote) {
357        this(input, delim);
358        setQuoteChar(quote);
359    }
360
361    /**
362     * Constructs a tokenizer splitting on the specified delimiter string.
363     *
364     * @param input  the string which is to be parsed
365     * @param delim  the field delimiter string
366     */
367    public StrTokenizer(final String input, final String delim) {
368        this(input);
369        setDelimiterString(delim);
370    }
371
372    /**
373     * Constructs a tokenizer splitting using the specified delimiter matcher.
374     *
375     * @param input  the string which is to be parsed
376     * @param delim  the field delimiter matcher
377     */
378    public StrTokenizer(final String input, final StrMatcher delim) {
379        this(input);
380        setDelimiterMatcher(delim);
381    }
382
383    /**
384     * Constructs a tokenizer splitting using the specified delimiter matcher
385     * and handling quotes using the specified quote matcher.
386     *
387     * @param input  the string which is to be parsed
388     * @param delim  the field delimiter matcher
389     * @param quote  the field quoted string matcher
390     */
391    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
392        this(input, delim);
393        setQuoteMatcher(quote);
394    }
395
396    /**
397     * Unsupported ListIterator operation.
398     * @param obj this parameter ignored.
399     * @throws UnsupportedOperationException always
400     */
401    @Override
402    public void add(final String obj) {
403        throw new UnsupportedOperationException("add() is unsupported");
404    }
405
406    /**
407     * Adds a token to a list, paying attention to the parameters we've set.
408     *
409     * @param list  the list to add to
410     * @param tok  the token to add
411     */
412    private void addToken(final List<String> list, String tok) {
413        if (StringUtils.isEmpty(tok)) {
414            if (isIgnoreEmptyTokens()) {
415                return;
416            }
417            if (isEmptyTokenAsNull()) {
418                tok = null;
419            }
420        }
421        list.add(tok);
422    }
423
424    /**
425     * Checks if tokenization has been done, and if not then do it.
426     */
427    private void checkTokenized() {
428        if (tokens == null) {
429            if (chars == null) {
430                // still call tokenize as subclass may do some work
431                final List<String> split = tokenize(null, 0, 0);
432                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
433            } else {
434                final List<String> split = tokenize(chars, 0, chars.length);
435                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
436            }
437        }
438    }
439
440    /**
441     * Creates a new instance of this Tokenizer. The new instance is reset so
442     * that it will be at the start of the token list.
443     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
444     *
445     * @return a new instance of this Tokenizer which has been reset.
446     */
447    @Override
448    public Object clone() {
449        try {
450            return cloneReset();
451        } catch (final CloneNotSupportedException ex) {
452            return null;
453        }
454    }
455
456    /**
457     * Creates a new instance of this Tokenizer. The new instance is reset so that
458     * it will be at the start of the token list.
459     *
460     * @return a new instance of this Tokenizer which has been reset.
461     * @throws CloneNotSupportedException if there is a problem cloning
462     */
463    Object cloneReset() throws CloneNotSupportedException {
464        // this method exists to enable 100% test coverage
465        final StrTokenizer cloned = (StrTokenizer) super.clone();
466        if (cloned.chars != null) {
467            cloned.chars = cloned.chars.clone();
468        }
469        cloned.reset();
470        return cloned;
471    }
472
473    /**
474     * Gets the String content that the tokenizer is parsing.
475     *
476     * @return the string content being parsed
477     */
478    public String getContent() {
479        if (chars == null) {
480            return null;
481        }
482        return new String(chars);
483    }
484
485    /**
486     * Gets the field delimiter matcher.
487     *
488     * @return the delimiter matcher in use
489     */
490    public StrMatcher getDelimiterMatcher() {
491        return this.delimMatcher;
492    }
493
494    // Ignored
495    /**
496     * Gets the ignored character matcher.
497     * <p>
498     * These characters are ignored when parsing the String, unless they are
499     * within a quoted region.
500     * The default value is not to ignore anything.
501     * </p>
502     *
503     * @return the ignored matcher in use
504     */
505    public StrMatcher getIgnoredMatcher() {
506        return ignoredMatcher;
507    }
508
509    /**
510     * Gets the quote matcher currently in use.
511     * <p>
512     * The quote character is used to wrap data between the tokens.
513     * This enables delimiters to be entered as data.
514     * The default value is '"' (double quote).
515     * </p>
516     *
517     * @return the quote matcher in use
518     */
519    public StrMatcher getQuoteMatcher() {
520        return quoteMatcher;
521    }
522
523    /**
524     * Gets a copy of the full token list as an independent modifiable array.
525     *
526     * @return the tokens as a String array
527     */
528    public String[] getTokenArray() {
529        checkTokenized();
530        return tokens.clone();
531    }
532
533    /**
534     * Gets a copy of the full token list as an independent modifiable list.
535     *
536     * @return the tokens as a String array
537     */
538    public List<String> getTokenList() {
539        checkTokenized();
540        final List<String> list = new ArrayList<>(tokens.length);
541        list.addAll(Arrays.asList(tokens));
542        return list;
543    }
544
545    /**
546     * Gets the trimmer character matcher.
547     * <p>
548     * These characters are trimmed off on each side of the delimiter
549     * until the token or quote is found.
550     * The default value is not to trim anything.
551     * </p>
552     *
553     * @return the trimmer matcher in use
554     */
555    public StrMatcher getTrimmerMatcher() {
556        return trimmerMatcher;
557    }
558
559    /**
560     * Checks whether there are any more tokens.
561     *
562     * @return true if there are more tokens
563     */
564    @Override
565    public boolean hasNext() {
566        checkTokenized();
567        return tokenPos < tokens.length;
568    }
569
570    /**
571     * Checks whether there are any previous tokens that can be iterated to.
572     *
573     * @return true if there are previous tokens
574     */
575    @Override
576    public boolean hasPrevious() {
577        checkTokenized();
578        return tokenPos > 0;
579    }
580
581    /**
582     * Gets whether the tokenizer currently returns empty tokens as null.
583     * The default for this property is false.
584     *
585     * @return true if empty tokens are returned as null
586     */
587    public boolean isEmptyTokenAsNull() {
588        return this.emptyAsNull;
589    }
590
591    /**
592     * Gets whether the tokenizer currently ignores empty tokens.
593     * The default for this property is true.
594     *
595     * @return true if empty tokens are not returned
596     */
597    public boolean isIgnoreEmptyTokens() {
598        return ignoreEmptyTokens;
599    }
600
601    /**
602     * Checks if the characters at the index specified match the quote
603     * already matched in readNextToken().
604     *
605     * @param srcChars  the character array being tokenized
606     * @param pos  the position to check for a quote
607     * @param len  the length of the character array being tokenized
608     * @param quoteStart  the start position of the matched quote, 0 if no quoting
609     * @param quoteLen  the length of the matched quote, 0 if no quoting
610     * @return true if a quote is matched
611     */
612    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
613        for (int i = 0; i < quoteLen; i++) {
614            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
615                return false;
616            }
617        }
618        return true;
619    }
620
621    /**
622     * Gets the next token.
623     *
624     * @return the next String token
625     * @throws NoSuchElementException if there are no more elements
626     */
627    @Override
628    public String next() {
629        if (hasNext()) {
630            return tokens[tokenPos++];
631        }
632        throw new NoSuchElementException();
633    }
634
635    /**
636     * Gets the index of the next token to return.
637     *
638     * @return the next token index
639     */
640    @Override
641    public int nextIndex() {
642        return tokenPos;
643    }
644
645    /**
646     * Gets the next token from the String.
647     * Equivalent to {@link #next()} except it returns null rather than
648     * throwing {@link NoSuchElementException} when no tokens remain.
649     *
650     * @return the next sequential token, or null when no more tokens are found
651     */
652    public String nextToken() {
653        if (hasNext()) {
654            return tokens[tokenPos++];
655        }
656        return null;
657    }
658
659    /**
660     * Gets the token previous to the last returned token.
661     *
662     * @return the previous token
663     */
664    @Override
665    public String previous() {
666        if (hasPrevious()) {
667            return tokens[--tokenPos];
668        }
669        throw new NoSuchElementException();
670    }
671
672    /**
673     * Gets the index of the previous token.
674     *
675     * @return the previous token index
676     */
677    @Override
678    public int previousIndex() {
679        return tokenPos - 1;
680    }
681
682    /**
683     * Gets the previous token from the String.
684     *
685     * @return the previous sequential token, or null when no more tokens are found
686     */
687    public String previousToken() {
688        if (hasPrevious()) {
689            return tokens[--tokenPos];
690        }
691        return null;
692    }
693
694    /**
695     * Reads character by character through the String to get the next token.
696     *
697     * @param srcChars  the character array being tokenized
698     * @param start  the first character of field
699     * @param len  the length of the character array being tokenized
700     * @param workArea  a temporary work area
701     * @param tokenList  the list of parsed tokens
702     * @return the starting position of the next field (the character
703     *  immediately after the delimiter), or -1 if end of string found
704     */
705    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
706        // skip all leading whitespace, unless it is the
707        // field delimiter or the quote character
708        while (start < len) {
709            final int removeLen = Math.max(
710                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
711                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
712            if (removeLen == 0 ||
713                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
714                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
715                break;
716            }
717            start += removeLen;
718        }
719
720        // handle reaching end
721        if (start >= len) {
722            addToken(tokenList, StringUtils.EMPTY);
723            return -1;
724        }
725
726        // handle empty token
727        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
728        if (delimLen > 0) {
729            addToken(tokenList, StringUtils.EMPTY);
730            return start + delimLen;
731        }
732
733        // handle found token
734        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
735        if (quoteLen > 0) {
736            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
737        }
738        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
739    }
740
741    /**
742     * Reads a possibly quoted string token.
743     *
744     * @param srcChars  the character array being tokenized
745     * @param start  the first character of field
746     * @param len  the length of the character array being tokenized
747     * @param workArea  a temporary work area
748     * @param tokenList  the list of parsed tokens
749     * @param quoteStart  the start position of the matched quote, 0 if no quoting
750     * @param quoteLen  the length of the matched quote, 0 if no quoting
751     * @return the starting position of the next field (the character
752     *  immediately after the delimiter, or if end of string found,
753     *  then the length of string
754     */
755    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
756                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
757        // Loop until we've found the end of the quoted
758        // string or the end of the input
759        workArea.clear();
760        int pos = start;
761        boolean quoting = quoteLen > 0;
762        int trimStart = 0;
763
764        while (pos < len) {
765            // quoting mode can occur several times throughout a string
766            // we must switch between quoting and non-quoting until we
767            // encounter a non-quoted delimiter, or end of string
768            if (quoting) {
769                // In quoting mode
770
771                // If we've found a quote character, see if it's
772                // followed by a second quote.  If so, then we need
773                // to actually put the quote character into the token
774                // rather than end the token.
775                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
776                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
777                        // matched pair of quotes, thus an escaped quote
778                        workArea.append(srcChars, pos, quoteLen);
779                        pos += quoteLen * 2;
780                        trimStart = workArea.size();
781                        continue;
782                    }
783
784                    // end of quoting
785                    quoting = false;
786                    pos += quoteLen;
787                    continue;
788                }
789
790            } else {
791                // Not in quoting mode
792
793                // check for delimiter, and thus end of token
794                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
795                if (delimLen > 0) {
796                    // return condition when end of token found
797                    addToken(tokenList, workArea.substring(0, trimStart));
798                    return pos + delimLen;
799                }
800
801                // check for quote, and thus back into quoting mode
802                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
803                    quoting = true;
804                    pos += quoteLen;
805                    continue;
806                }
807
808                // check for ignored (outside quotes), and ignore
809                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
810                if (ignoredLen > 0) {
811                    pos += ignoredLen;
812                    continue;
813                }
814
815                // check for trimmed character
816                // don't yet know if it's at the end, so copy to workArea
817                // use trimStart to keep track of trim at the end
818                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
819                if (trimmedLen > 0) {
820                    workArea.append(srcChars, pos, trimmedLen);
821                    pos += trimmedLen;
822                    continue;
823                }
824            }
825            // copy regular character from inside quotes
826            workArea.append(srcChars[pos++]);
827            trimStart = workArea.size();
828        }
829
830        // return condition when end of string found
831        addToken(tokenList, workArea.substring(0, trimStart));
832        return -1;
833    }
834
835    /**
836     * Unsupported ListIterator operation.
837     *
838     * @throws UnsupportedOperationException always
839     */
840    @Override
841    public void remove() {
842        throw new UnsupportedOperationException("remove() is unsupported");
843    }
844
845    /**
846     * Resets this tokenizer, forgetting all parsing and iteration already completed.
847     * <p>
848     * This method allows the same tokenizer to be reused for the same String.
849     * </p>
850     *
851     * @return this, to enable chaining
852     */
853    public StrTokenizer reset() {
854        tokenPos = 0;
855        tokens = null;
856        return this;
857    }
858
859    /**
860     * Reset this tokenizer, giving it a new input string to parse.
861     * In this manner you can re-use a tokenizer with the same settings
862     * on multiple input lines.
863     *
864     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
865     * @return this, to enable chaining
866     */
867    public StrTokenizer reset(final char[] input) {
868        reset();
869        this.chars = ArrayUtils.clone(input);
870        return this;
871    }
872
873    /**
874     * Reset this tokenizer, giving it a new input string to parse.
875     * In this manner you can re-use a tokenizer with the same settings
876     * on multiple input lines.
877     *
878     * @param input  the new string to tokenize, null sets no text to parse
879     * @return this, to enable chaining
880     */
881    public StrTokenizer reset(final String input) {
882        reset();
883        if (input != null) {
884            this.chars = input.toCharArray();
885        } else {
886            this.chars = null;
887        }
888        return this;
889    }
890
891    /**
892     * Unsupported ListIterator operation.
893     * @param obj this parameter ignored.
894     * @throws UnsupportedOperationException always
895     */
896    @Override
897    public void set(final String obj) {
898        throw new UnsupportedOperationException("set() is unsupported");
899    }
900
901    /**
902     * Sets the field delimiter character.
903     *
904     * @param delim  the delimiter character to use
905     * @return this, to enable chaining
906     */
907    public StrTokenizer setDelimiterChar(final char delim) {
908        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
909    }
910
911    /**
912     * Sets the field delimiter matcher.
913     * <p>
914     * The delimiter is used to separate one token from another.
915     * </p>
916     *
917     * @param delim  the delimiter matcher to use
918     * @return this, to enable chaining
919     */
920    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
921        if (delim == null) {
922            this.delimMatcher = StrMatcher.noneMatcher();
923        } else {
924            this.delimMatcher = delim;
925        }
926        return this;
927    }
928
929    /**
930     * Sets the field delimiter string.
931     *
932     * @param delim  the delimiter string to use
933     * @return this, to enable chaining
934     */
935    public StrTokenizer setDelimiterString(final String delim) {
936        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
937    }
938
939    /**
940     * Sets whether the tokenizer should return empty tokens as null.
941     * The default for this property is false.
942     *
943     * @param emptyAsNull  whether empty tokens are returned as null
944     * @return this, to enable chaining
945     */
946    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
947        this.emptyAsNull = emptyAsNull;
948        return this;
949    }
950
951    /**
952     * Sets the character to ignore.
953     * <p>
954     * This character is ignored when parsing the String, unless it is
955     * within a quoted region.
956     *
957     * @param ignored  the ignored character to use
958     * @return this, to enable chaining
959     */
960    public StrTokenizer setIgnoredChar(final char ignored) {
961        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
962    }
963
964    /**
965     * Sets the matcher for characters to ignore.
966     * <p>
967     * These characters are ignored when parsing the String, unless they are
968     * within a quoted region.
969     * </p>
970     *
971     * @param ignored  the ignored matcher to use, null ignored
972     * @return this, to enable chaining
973     */
974    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
975        if (ignored != null) {
976            this.ignoredMatcher = ignored;
977        }
978        return this;
979    }
980
981    /**
982     * Sets whether the tokenizer should ignore and not return empty tokens.
983     * The default for this property is true.
984     *
985     * @param ignoreEmptyTokens  whether empty tokens are not returned
986     * @return this, to enable chaining
987     */
988    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
989        this.ignoreEmptyTokens = ignoreEmptyTokens;
990        return this;
991    }
992
993    /**
994     * Sets the quote character to use.
995     * <p>
996     * The quote character is used to wrap data between the tokens.
997     * This enables delimiters to be entered as data.
998     * </p>
999     *
1000     * @param quote  the quote character to use
1001     * @return this, to enable chaining
1002     */
1003    public StrTokenizer setQuoteChar(final char quote) {
1004        return setQuoteMatcher(StrMatcher.charMatcher(quote));
1005    }
1006
1007    /**
1008     * Sets the quote matcher to use.
1009     * <p>
1010     * The quote character is used to wrap data between the tokens.
1011     * This enables delimiters to be entered as data.
1012     * </p>
1013     *
1014     * @param quote  the quote matcher to use, null ignored
1015     * @return this, to enable chaining
1016     */
1017    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1018        if (quote != null) {
1019            this.quoteMatcher = quote;
1020        }
1021        return this;
1022    }
1023
1024    /**
1025     * Sets the matcher for characters to trim.
1026     * <p>
1027     * These characters are trimmed off on each side of the delimiter
1028     * until the token or quote is found.
1029     * </p>
1030     *
1031     * @param trimmer  the trimmer matcher to use, null ignored
1032     * @return this, to enable chaining
1033     */
1034    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1035        if (trimmer != null) {
1036            this.trimmerMatcher = trimmer;
1037        }
1038        return this;
1039    }
1040
1041    // API
1042    /**
1043     * Gets the number of tokens found in the String.
1044     *
1045     * @return the number of matched tokens
1046     */
1047    public int size() {
1048        checkTokenized();
1049        return tokens.length;
1050    }
1051
1052    /**
1053     * Internal method to performs the tokenization.
1054     * <p>
1055     * Most users of this class do not need to call this method. This method
1056     * will be called automatically by other (public) methods when required.
1057     * </p>
1058     * <p>
1059     * This method exists to allow subclasses to add code before or after the
1060     * tokenization. For example, a subclass could alter the character array,
1061     * offset or count to be parsed, or call the tokenizer multiple times on
1062     * multiple strings. It is also be possible to filter the results.
1063     * </p>
1064     * <p>
1065     * {@link StrTokenizer} will always pass a zero offset and a count
1066     * equal to the length of the array to this method, however a subclass
1067     * may pass other values, or even an entirely different array.
1068     * </p>
1069     *
1070     * @param srcChars  the character array being tokenized, may be null
1071     * @param offset  the start position within the character array, must be valid
1072     * @param count  the number of characters to tokenize, must be valid
1073     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
1074     */
1075    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1076        if (ArrayUtils.isEmpty(srcChars)) {
1077            return Collections.emptyList();
1078        }
1079        final StrBuilder buf = new StrBuilder();
1080        final List<String> tokenList = new ArrayList<>();
1081        int pos = offset;
1082
1083        // loop around the entire buffer
1084        while (pos >= 0 && pos < count) {
1085            // find next token
1086            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1087
1088            // handle case where end of string is a delimiter
1089            if (pos >= count) {
1090                addToken(tokenList, StringUtils.EMPTY);
1091            }
1092        }
1093        return tokenList;
1094    }
1095
1096    /**
1097     * Gets the String content that the tokenizer is parsing.
1098     *
1099     * @return the string content being parsed
1100     */
1101    @Override
1102    public String toString() {
1103        if (tokens == null) {
1104            return "StrTokenizer[not tokenized yet]";
1105        }
1106        return "StrTokenizer" + getTokenList();
1107    }
1108
1109}