/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StrTokenizer.html">
 * StrTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {
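
    /*
     * Illustrative usage sketch (editor's note, not part of the original API docs):
     * by default the tokenizer behaves like java.util.StringTokenizer and splits
     * on whitespace.
     *
     *   StrTokenizer tok = new StrTokenizer("one two\tthree");
     *   while (tok.hasNext()) {
     *       String token = tok.next();   // "one", then "two", then "three"
     *   }
     *
     *   // The full result can also be pulled out in one go:
     *   String[] all = new StrTokenizer("a b c").getTokenArray();   // {"a", "b", "c"}
     */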

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }
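
    /*
     * Illustrative CSV sketch: the prototype above uses a comma delimiter,
     * double-quote quoting and whitespace trimming, and it keeps empty tokens.
     *
     *   StrTokenizer csv = StrTokenizer.getCSVInstance(" a, \"b, still b\" ,, d ");
     *   String[] fields = csv.getTokenArray();
     *   // fields: {"a", "b, still b", "", "d"}
     *   // (quoted comma kept, surrounding whitespace trimmed, and the empty field
     *   //  preserved because ignoreEmptyTokens is false for the CSV prototype)
     */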

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }


    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }
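
    /*
     * Illustrative TSV sketch: the same configuration as the CSV prototype, but
     * with a tab delimiter.
     *
     *   StrTokenizer tsv = StrTokenizer.getTSVInstance("a\tb\t c ");
     *   String[] fields = tsv.getTokenArray();   // {"a", "b", "c"}
     */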

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
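
    /*
     * Illustrative matcher-based construction sketch: the StrMatcher factories in
     * this package can be combined, e.g. splitting on either ';' or ':' while
     * honouring single-quoted sections.
     *
     *   StrTokenizer tok = new StrTokenizer("a;'b:c';d",
     *           StrMatcher.charSetMatcher(";:"),
     *           StrMatcher.singleQuoteMatcher());
     *   // tokens: "a", "b:c", "d"
     */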

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed, cloned internally
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed, cloned internally
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed, cloned internally
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the character array which is to be parsed, cloned internally
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed, cloned internally
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed, cloned internally
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }
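
    /*
     * Illustrative sketch of the two iteration styles: nextToken()/previousToken()
     * return null when exhausted, whereas the ListIterator methods next()/previous()
     * throw NoSuchElementException instead.
     *
     *   StrTokenizer tok = new StrTokenizer("a b");
     *   String t;
     *   while ((t = tok.nextToken()) != null) {
     *       // t is "a", then "b"; the loop ends when nextToken() returns null
     *   }
     */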

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }
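
    /*
     * Illustrative reuse sketch: a tokenizer can be configured once and pointed at
     * successive input lines via reset(String). The 'lines' variable below is only
     * a placeholder for wherever the input actually comes from.
     *
     *   StrTokenizer tok = StrTokenizer.getCSVInstance();
     *   for (String line : lines) {          // 'lines' assumed to be an Iterable<String>
     *       String[] fields = tok.reset(line).getTokenArray();
     *   }
     */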

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }
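
    /*
     * Illustrative subclassing sketch: because tokenize() is protected, a subclass
     * can post-process the token list. The class name and the blank-token filter
     * below are made up for the example; List, ArrayList and StringUtils are the
     * imports already present in this file.
     *
     *   class NonBlankTokenizer extends StrTokenizer {
     *       NonBlankTokenizer(final String input) {
     *           super(input);
     *       }
     *       @Override
     *       protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
     *           final List<String> result = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *           result.removeIf(StringUtils::isBlank);   // drop empty/whitespace-only tokens
     *           return result;
     *       }
     *   }
     */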

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  was reached
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use, null resolves to the none matcher
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }
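
    /*
     * Illustrative delimiter configuration sketch: a single character, a
     * multi-character string, or any StrMatcher can act as the delimiter.
     *
     *   StrTokenizer tok = new StrTokenizer("a||b||c").setDelimiterString("||");
     *   // tokens: "a", "b", "c"
     *
     *   tok.reset("x\ty").setDelimiterMatcher(StrMatcher.tabMatcher());
     *   // tokens: "x", "y"
     */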

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is the none matcher, so no quoting is applied unless a
     * quote matcher or character is set.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }
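
    /*
     * Illustrative quoting sketch: with a quote character set, delimiters inside a
     * quoted section are treated as data, and a doubled quote inside the section
     * stands for a literal quote character.
     *
     *   StrTokenizer tok = new StrTokenizer("a,'b,c','it''s',d", ',', '\'');
     *   // tokens: "a", "b,c", "it's", "d"
     */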

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }
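
    /*
     * Illustrative ignored-character sketch: ignored characters are dropped
     * anywhere outside a quoted section, e.g. stray carriage returns in a line.
     *
     *   StrTokenizer tok = new StrTokenizer("a,b\r,c", ',').setIgnoredChar('\r');
     *   // tokens: "a", "b", "c"
     */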

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }
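
    /*
     * Illustrative trimmer sketch: the trim matcher strips matched characters from
     * both ends of each token while leaving them intact in the middle.
     *
     *   StrTokenizer tok = new StrTokenizer(" a , b b ", ',')
     *           .setTrimmerMatcher(StrMatcher.trimMatcher());
     *   // tokens: "a", "b b"
     */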

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }
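
    /*
     * Illustrative empty-token sketch for the two flags above, using "a,,b" with a
     * comma delimiter:
     *
     *   new StrTokenizer("a,,b", ',').getTokenArray();
     *   // default: {"a", "b"}            (empty tokens ignored)
     *
     *   new StrTokenizer("a,,b", ',').setIgnoreEmptyTokens(false).getTokenArray();
     *   // {"a", "", "b"}                 (empty token kept)
     *
     *   new StrTokenizer("a,,b", ',').setIgnoreEmptyTokens(false)
     *           .setEmptyTokenAsNull(true).getTokenArray();
     *   // {"a", null, "b"}               (empty token returned as null)
     */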

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of the tokenizer, including the parsed
     * token list if tokenization has already taken place.
     *
     * @return a debug string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}