Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.List;
022import java.util.ListIterator;
023import java.util.NoSuchElementException;
024
025import org.apache.commons.lang3.StringUtils;
026
027/**
028 * Tokenizes a string based on delimiters (separators)
029 * and supporting quoting and ignored character concepts.
030 * <p>
031 * This class can split a String into many smaller strings. It aims
032 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
033 * however it offers much more control and flexibility including implementing
034 * the {@code ListIterator} interface. By default, it is set up
035 * like {@code StringTokenizer}.
036 * <p>
037 * The input String is split into a number of <i>tokens</i>.
038 * Each token is separated from the next String by a <i>delimiter</i>.
039 * One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes.
042 * The <i>quote</i> matcher specifies the quote character(s).
043 * A quote may be escaped within a quoted section by duplicating itself.
044 * <p>
045 * Between each token and the delimiter are potentially characters that need trimming.
046 * The <i>trimmer</i> matcher specifies these characters.
047 * One usage might be to trim whitespace characters.
048 * <p>
049 * At any point outside the quotes there might potentially be invalid characters.
050 * The <i>ignored</i> matcher specifies these characters to be removed.
051 * One usage might be to remove new line characters.
052 * <p>
053 * Empty tokens may be removed or returned as null.
054 * <pre>
055 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
056 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
057 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
058 * </pre>
059 *
060 * <table>
061 *  <caption>StrTokenizer properties and options</caption>
062 *  <tr>
063 *   <th>Property</th><th>Type</th><th>Default</th>
064 *  </tr>
065 *  <tr>
066 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
067 *  </tr>
068 *  <tr>
069 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
070 *  </tr>
071 *  <tr>
072 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
073 *  </tr>
074 *  <tr>
075 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
076 *  </tr>
077 *  <tr>
078 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
079 *  </tr>
080 * </table>
081 *
082 * @since 1.0
083 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
084 */
085@Deprecated
086public class StrTokenizer implements ListIterator<String>, Cloneable {
087
088    /** Comma separated values tokenizer internal variable. */
089    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
090    /** Tab separated values tokenizer internal variable. */
091    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
092    static {
093        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
094        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
095        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
096        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
097        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
098        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
099        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100
101        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108    }
109
    /** The characters to tokenize; {@code null} when no input has been set. */
    private char[] chars;
    /** The parsed tokens; {@code null} until tokenization has run and again after a reset. */
    private String[] tokens;
    /** The current iteration position, i.e. the index of the token returned by the next call to {@code next()}. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher; defaults to matching nothing. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher; defaults to matching nothing. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher; defaults to matching nothing. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; off by default. */
    private boolean emptyAsNull = false;
    /** Whether to ignore (drop) empty tokens; on by default. */
    private boolean ignoreEmptyTokens = true;
130
131    //-----------------------------------------------------------------------
132
133    /**
134     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
135     *
136     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
137     */
138    private static StrTokenizer getCSVClone() {
139        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140    }
141
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The instance is a clone of the CSV prototype, so it uses the prototype's
     * trimmer at both ends of a token (which can be overridden with the
     * setTrimmer method).
     * <p>
     * No input is set on the returned instance: you must call a "reset" method
     * to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }
154
155    /**
156     * Gets a new tokenizer instance which parses Comma Separated Value strings
157     * initializing it with the given input.  The default for CSV processing
158     * will be trim whitespace from both ends (which can be overridden with
159     * the setTrimmer method).
160     *
161     * @param input  the text to parse
162     * @return a new tokenizer instance which parses Comma Separated Value strings
163     */
164    public static StrTokenizer getCSVInstance(final String input) {
165        final StrTokenizer tok = getCSVClone();
166        tok.reset(input);
167        return tok;
168    }
169
170    /**
171     * Gets a new tokenizer instance which parses Comma Separated Value strings
172     * initializing it with the given input.  The default for CSV processing
173     * will be trim whitespace from both ends (which can be overridden with
174     * the setTrimmer method).
175     *
176     * @param input  the text to parse
177     * @return a new tokenizer instance which parses Comma Separated Value strings
178     */
179    public static StrTokenizer getCSVInstance(final char[] input) {
180        final StrTokenizer tok = getCSVClone();
181        tok.reset(input);
182        return tok;
183    }
184
185    /**
186     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
187     *
188     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
189     */
190    private static StrTokenizer getTSVClone() {
191        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192    }
193
194
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The instance is a clone of the TSV prototype, so it uses the prototype's
     * trimmer at both ends of a token (which can be overridden with the
     * setTrimmer method).
     * <p>
     * No input is set on the returned instance: you must call a "reset" method
     * to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }
206
207    /**
208     * Gets a new tokenizer instance which parses Tab Separated Value strings.
209     * The default for CSV processing will be trim whitespace from both ends
210     * (which can be overridden with the setTrimmer method).
211     * @param input  the string to parse
212     * @return a new tokenizer instance which parses Tab Separated Value strings.
213     */
214    public static StrTokenizer getTSVInstance(final String input) {
215        final StrTokenizer tok = getTSVClone();
216        tok.reset(input);
217        return tok;
218    }
219
220    /**
221     * Gets a new tokenizer instance which parses Tab Separated Value strings.
222     * The default for CSV processing will be trim whitespace from both ends
223     * (which can be overridden with the setTrimmer method).
224     * @param input  the string to parse
225     * @return a new tokenizer instance which parses Tab Separated Value strings.
226     */
227    public static StrTokenizer getTSVInstance(final char[] input) {
228        final StrTokenizer tok = getTSVClone();
229        tok.reset(input);
230        return tok;
231    }
232
233    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer that uses the default matchers (delimiter
     * {@code StrMatcher.splitMatcher()}, no quoting, nothing ignored or
     * trimmed) and has no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        // no input yet; tokenization of a null buffer yields no tokens
        this.chars = null;
    }
244
245    /**
246     * Constructs a tokenizer splitting on space, tab, newline and form feed
247     * as per StringTokenizer.
248     *
249     * @param input  the string which is to be parsed
250     */
251    public StrTokenizer(final String input) {
252        super();
253        if (input != null) {
254            chars = input.toCharArray();
255        } else {
256            chars = null;
257        }
258    }
259
    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     * <p>
     * Delegates to {@code StrTokenizer(String)} and then installs the
     * delimiter through {@link #setDelimiterChar(char)}.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
270
    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     * <p>
     * Delegates to {@code StrTokenizer(String)} and then installs the
     * delimiter through {@code setDelimiterString(String)}.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
281
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     * <p>
     * Delegates to {@code StrTokenizer(String)} and then installs the
     * delimiter through {@link #setDelimiterMatcher(StrMatcher)}, which
     * treats a null matcher as "match nothing".
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
292
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     * <p>
     * Delegates to {@code StrTokenizer(String, char)} and then installs the
     * quote through {@code setQuoteChar(char)}.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
305
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     * <p>
     * Delegates to {@code StrTokenizer(String, StrMatcher)} and then installs
     * the quote through {@code setQuoteMatcher(StrMatcher)}.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
318
319    /**
320     * Constructs a tokenizer splitting on space, tab, newline and form feed
321     * as per StringTokenizer.
322     *
323     * @param input  the string which is to be parsed, not cloned
324     */
325    public StrTokenizer(final char[] input) {
326        super();
327        if (input == null) {
328            this.chars = null;
329        } else {
330            this.chars = input.clone();
331        }
332    }
333
    /**
     * Constructs a tokenizer splitting on the specified character.
     * <p>
     * Delegates to {@code StrTokenizer(char[])}, which stores a copy of the
     * array, and then installs the delimiter through
     * {@link #setDelimiterChar(char)}.
     *
     * @param input  the characters which are to be parsed (copied by the delegate constructor)
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
344
    /**
     * Constructs a tokenizer splitting on the specified string.
     * <p>
     * Delegates to {@code StrTokenizer(char[])}, which stores a copy of the
     * array, and then installs the delimiter through
     * {@code setDelimiterString(String)}.
     *
     * @param input  the characters which are to be parsed (copied by the delegate constructor)
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
355
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     * <p>
     * Delegates to {@code StrTokenizer(char[])}, which stores a copy of the
     * array, and then installs the delimiter through
     * {@link #setDelimiterMatcher(StrMatcher)}.
     *
     * @param input  the characters which are to be parsed (copied by the delegate constructor)
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
366
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     * <p>
     * Delegates to {@code StrTokenizer(char[], char)} and then installs the
     * quote through {@code setQuoteChar(char)}.
     *
     * @param input  the characters which are to be parsed (copied by the delegate constructor)
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
379
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     * <p>
     * Delegates to {@code StrTokenizer(char[], StrMatcher)} and then installs
     * the quote through {@code setQuoteMatcher(StrMatcher)}.
     *
     * @param input  the characters which are to be parsed (copied by the delegate constructor)
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
392
393    // API
394    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     * <p>
     * Triggers tokenization if it has not been performed yet.
     *
     * @return The number of matched tokens
     */
    public int size() {
        // lazily parse the input so that tokens is non-null
        checkTokenized();
        return tokens.length;
    }
404
405    /**
406     * Gets the next token from the String.
407     * Equivalent to {@link #next()} except it returns null rather than
408     * throwing {@link NoSuchElementException} when no tokens remain.
409     *
410     * @return The next sequential token, or null when no more tokens are found
411     */
412    public String nextToken() {
413        if (hasNext()) {
414            return tokens[tokenPos++];
415        }
416        return null;
417    }
418
419    /**
420     * Gets the previous token from the String.
421     *
422     * @return The previous sequential token, or null when no more tokens are found
423     */
424    public String previousToken() {
425        if (hasPrevious()) {
426            return tokens[--tokenPos];
427        }
428        return null;
429    }
430
431    /**
432     * Gets a copy of the full token list as an independent modifiable array.
433     *
434     * @return The tokens as a String array
435     */
436    public String[] getTokenArray() {
437        checkTokenized();
438        return tokens.clone();
439    }
440
441    /**
442     * Gets a copy of the full token list as an independent modifiable list.
443     *
444     * @return The tokens as a String array
445     */
446    public List<String> getTokenList() {
447        checkTokenized();
448        final List<String> list = new ArrayList<>(tokens.length);
449        Collections.addAll(list, tokens);
450
451        return list;
452    }
453
    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String:
     * the input characters are kept, only the cached tokens and the iteration
     * position are discarded.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        // null marks "not tokenized yet", so the next access re-parses the input
        tokens = null;
        return this;
    }
466
467    /**
468     * Reset this tokenizer, giving it a new input string to parse.
469     * In this manner you can re-use a tokenizer with the same settings
470     * on multiple input lines.
471     *
472     * @param input  the new string to tokenize, null sets no text to parse
473     * @return this, to enable chaining
474     */
475    public StrTokenizer reset(final String input) {
476        reset();
477        if (input != null) {
478            this.chars = input.toCharArray();
479        } else {
480            this.chars = null;
481        }
482        return this;
483    }
484
485    /**
486     * Reset this tokenizer, giving it a new input string to parse.
487     * In this manner you can re-use a tokenizer with the same settings
488     * on multiple input lines.
489     *
490     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
491     * @return this, to enable chaining
492     */
493    public StrTokenizer reset(final char[] input) {
494        reset();
495        if (input != null) {
496            this.chars = input.clone();
497        } else {
498            this.chars = null;
499        }
500        return this;
501    }
502
503    // ListIterator
504    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     * <p>
     * Triggers tokenization if it has not been performed yet.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        // lazily parse the input so that tokens is non-null
        checkTokenized();
        return tokenPos < tokens.length;
    }
515
516    /**
517     * Gets the next token.
518     *
519     * @return The next String token
520     * @throws NoSuchElementException if there are no more elements
521     */
522    @Override
523    public String next() {
524        if (hasNext()) {
525            return tokens[tokenPos++];
526        }
527        throw new NoSuchElementException();
528    }
529
    /**
     * Gets the index of the next token to return.
     * <p>
     * This reads the current iteration position only; it does not trigger
     * tokenization.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }
539
    /**
     * Checks whether there are any previous tokens that can be iterated to.
     * <p>
     * Triggers tokenization if it has not been performed yet.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        // position 0 means nothing has been consumed yet
        return tokenPos > 0;
    }
550
551    /**
552     * Gets the token previous to the last returned token.
553     *
554     * @return The previous token
555     */
556    @Override
557    public String previous() {
558        if (hasPrevious()) {
559            return tokens[--tokenPos];
560        }
561        throw new NoSuchElementException();
562    }
563
    /**
     * Gets the index of the previous token.
     * <p>
     * This is the current iteration position minus one, so it is -1 when the
     * iteration is at the start. It does not trigger tokenization.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }
573
    /**
     * Unsupported ListIterator operation: tokens cannot be removed through
     * this iterator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
583
    /**
     * Unsupported ListIterator operation: tokens cannot be replaced through
     * this iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }
593
    /**
     * Unsupported ListIterator operation: tokens cannot be inserted through
     * this iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
603
604    // Implementation
605    //-----------------------------------------------------------------------
606    /**
607     * Checks if tokenization has been done, and if not then do it.
608     */
609    private void checkTokenized() {
610        if (tokens == null) {
611            if (chars == null) {
612                // still call tokenize as subclass may do some work
613                final List<String> split = tokenize(null, 0, 0);
614                tokens = split.toArray(new String[split.size()]);
615            } else {
616                final List<String> split = tokenize(chars, 0, chars.length);
617                tokens = split.toArray(new String[split.size()]);
618            }
619        }
620    }
621
622    /**
623     * Internal method to performs the tokenization.
624     * <p>
625     * Most users of this class do not need to call this method. This method
626     * will be called automatically by other (public) methods when required.
627     * <p>
628     * This method exists to allow subclasses to add code before or after the
629     * tokenization. For example, a subclass could alter the character array,
630     * offset or count to be parsed, or call the tokenizer multiple times on
631     * multiple strings. It is also be possible to filter the results.
632     * <p>
633     * {@code StrTokenizer} will always pass a zero offset and a count
634     * equal to the length of the array to this method, however a subclass
635     * may pass other values, or even an entirely different array.
636     *
637     * @param srcChars  the character array being tokenized, may be null
638     * @param offset  the start position within the character array, must be valid
639     * @param count  the number of characters to tokenize, must be valid
640     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
641     */
642    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
643        if (srcChars == null || count == 0) {
644            return Collections.emptyList();
645        }
646        final StrBuilder buf = new StrBuilder();
647        final List<String> tokenList = new ArrayList<>();
648        int pos = offset;
649
650        // loop around the entire buffer
651        while (pos >= 0 && pos < count) {
652            // find next token
653            pos = readNextToken(srcChars, pos, count, buf, tokenList);
654
655            // handle case where end of string is a delimiter
656            if (pos >= count) {
657                addToken(tokenList, StringUtils.EMPTY);
658            }
659        }
660        return tokenList;
661    }
662
663    /**
664     * Adds a token to a list, paying attention to the parameters we've set.
665     *
666     * @param list  the list to add to
667     * @param tok  the token to add
668     */
669    private void addToken(final List<String> list, String tok) {
670        if (tok == null || tok.length() == 0) {
671            if (isIgnoreEmptyTokens()) {
672                return;
673            }
674            if (isEmptyTokenAsNull()) {
675                tok = null;
676            }
677        }
678        list.add(tok);
679    }
680
681    /**
682     * Reads character by character through the String to get the next token.
683     *
684     * @param srcChars  the character array being tokenized
685     * @param start  the first character of field
686     * @param len  the length of the character array being tokenized
687     * @param workArea  a temporary work area
688     * @param tokenList  the list of parsed tokens
689     * @return The starting position of the next field (the character
690     *  immediately after the delimiter), or -1 if end of string found
691     */
692    private int readNextToken(final char[] srcChars,
693                              int start,
694                              final int len,
695                              final StrBuilder workArea,
696                              final List<String> tokenList) {
697        // skip all leading whitespace, unless it is the
698        // field delimiter or the quote character
699        while (start < len) {
700            final int removeLen = Math.max(
701                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
702                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
703            if (removeLen == 0
704                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
705                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
706                break;
707            }
708            start += removeLen;
709        }
710
711        // handle reaching end
712        if (start >= len) {
713            addToken(tokenList, StringUtils.EMPTY);
714            return -1;
715        }
716
717        // handle empty token
718        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
719        if (delimLen > 0) {
720            addToken(tokenList, StringUtils.EMPTY);
721            return start + delimLen;
722        }
723
724        // handle found token
725        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
726        if (quoteLen > 0) {
727            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
728        }
729        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
730    }
731
732    /**
733     * Reads a possibly quoted string token.
734     *
735     * @param srcChars  the character array being tokenized
736     * @param start  the first character of field
737     * @param len  the length of the character array being tokenized
738     * @param workArea  a temporary work area
739     * @param tokenList  the list of parsed tokens
740     * @param quoteStart  the start position of the matched quote, 0 if no quoting
741     * @param quoteLen  the length of the matched quote, 0 if no quoting
742     * @return The starting position of the next field (the character
743     *  immediately after the delimiter, or if end of string found,
744     *  then the length of string
745     */
746    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
747                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
748        // Loop until we've found the end of the quoted
749        // string or the end of the input
750        workArea.clear();
751        int pos = start;
752        boolean quoting = quoteLen > 0;
753        int trimStart = 0;
754
755        while (pos < len) {
756            // quoting mode can occur several times throughout a string
757            // we must switch between quoting and non-quoting until we
758            // encounter a non-quoted delimiter, or end of string
759            if (quoting) {
760                // In quoting mode
761
762                // If we've found a quote character, see if it's
763                // followed by a second quote.  If so, then we need
764                // to actually put the quote character into the token
765                // rather than end the token.
766                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
767                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
768                        // matched pair of quotes, thus an escaped quote
769                        workArea.append(srcChars, pos, quoteLen);
770                        pos += quoteLen * 2;
771                        trimStart = workArea.size();
772                        continue;
773                    }
774
775                    // end of quoting
776                    quoting = false;
777                    pos += quoteLen;
778                    continue;
779                }
780
781                // copy regular character from inside quotes
782                workArea.append(srcChars[pos++]);
783                trimStart = workArea.size();
784
785            } else {
786                // Not in quoting mode
787
788                // check for delimiter, and thus end of token
789                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
790                if (delimLen > 0) {
791                    // return condition when end of token found
792                    addToken(tokenList, workArea.substring(0, trimStart));
793                    return pos + delimLen;
794                }
795
796                // check for quote, and thus back into quoting mode
797                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
798                    quoting = true;
799                    pos += quoteLen;
800                    continue;
801                }
802
803                // check for ignored (outside quotes), and ignore
804                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
805                if (ignoredLen > 0) {
806                    pos += ignoredLen;
807                    continue;
808                }
809
810                // check for trimmed character
811                // don't yet know if its at the end, so copy to workArea
812                // use trimStart to keep track of trim at the end
813                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
814                if (trimmedLen > 0) {
815                    workArea.append(srcChars, pos, trimmedLen);
816                    pos += trimmedLen;
817                    continue;
818                }
819
820                // copy regular character from outside quotes
821                workArea.append(srcChars[pos++]);
822                trimStart = workArea.size();
823            }
824        }
825
826        // return condition when end of string found
827        addToken(tokenList, workArea.substring(0, trimStart));
828        return -1;
829    }
830
831    /**
832     * Checks if the characters at the index specified match the quote
833     * already matched in readNextToken().
834     *
835     * @param srcChars  the character array being tokenized
836     * @param pos  the position to check for a quote
837     * @param len  the length of the character array being tokenized
838     * @param quoteStart  the start position of the matched quote, 0 if no quoting
839     * @param quoteLen  the length of the matched quote, 0 if no quoting
840     * @return true if a quote is matched
841     */
842    private boolean isQuote(final char[] srcChars,
843                            final int pos,
844                            final int len,
845                            final int quoteStart,
846                            final int quoteLen) {
847        for (int i = 0; i < quoteLen; i++) {
848            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
849                return false;
850            }
851        }
852        return true;
853    }
854
855    // Delimiter
856    //-----------------------------------------------------------------------
857    /**
858     * Gets the field delimiter matcher.
859     *
860     * @return The delimiter matcher in use
861     */
862    public StrMatcher getDelimiterMatcher() {
863        return this.delimMatcher;
864    }
865
866    /**
867     * Sets the field delimiter matcher.
868     * <p>
869     * The delimiter is used to separate one token from another.
870     *
871     * @param delim  the delimiter matcher to use
872     * @return this, to enable chaining
873     */
874    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
875        if (delim == null) {
876            this.delimMatcher = StrMatcher.noneMatcher();
877        } else {
878            this.delimMatcher = delim;
879        }
880        return this;
881    }
882
883    /**
884     * Sets the field delimiter character.
885     *
886     * @param delim  the delimiter character to use
887     * @return this, to enable chaining
888     */
889    public StrTokenizer setDelimiterChar(final char delim) {
890        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
891    }
892
893    /**
894     * Sets the field delimiter string.
895     *
896     * @param delim  the delimiter string to use
897     * @return this, to enable chaining
898     */
899    public StrTokenizer setDelimiterString(final String delim) {
900        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
901    }
902
903    // Quote
904    //-----------------------------------------------------------------------
905    /**
906     * Gets the quote matcher currently in use.
907     * <p>
908     * The quote character is used to wrap data between the tokens.
909     * This enables delimiters to be entered as data.
910     * The default value is '"' (double quote).
911     *
912     * @return The quote matcher in use
913     */
914    public StrMatcher getQuoteMatcher() {
915        return quoteMatcher;
916    }
917
918    /**
919     * Set the quote matcher to use.
920     * <p>
921     * The quote character is used to wrap data between the tokens.
922     * This enables delimiters to be entered as data.
923     *
924     * @param quote  the quote matcher to use, null ignored
925     * @return this, to enable chaining
926     */
927    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
928        if (quote != null) {
929            this.quoteMatcher = quote;
930        }
931        return this;
932    }
933
934    /**
935     * Sets the quote character to use.
936     * <p>
937     * The quote character is used to wrap data between the tokens.
938     * This enables delimiters to be entered as data.
939     *
940     * @param quote  the quote character to use
941     * @return this, to enable chaining
942     */
943    public StrTokenizer setQuoteChar(final char quote) {
944        return setQuoteMatcher(StrMatcher.charMatcher(quote));
945    }
946
947    // Ignored
948    //-----------------------------------------------------------------------
949    /**
950     * Gets the ignored character matcher.
951     * <p>
952     * These characters are ignored when parsing the String, unless they are
953     * within a quoted region.
954     * The default value is not to ignore anything.
955     *
956     * @return The ignored matcher in use
957     */
958    public StrMatcher getIgnoredMatcher() {
959        return ignoredMatcher;
960    }
961
962    /**
963     * Set the matcher for characters to ignore.
964     * <p>
965     * These characters are ignored when parsing the String, unless they are
966     * within a quoted region.
967     *
968     * @param ignored  the ignored matcher to use, null ignored
969     * @return this, to enable chaining
970     */
971    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
972        if (ignored != null) {
973            this.ignoredMatcher = ignored;
974        }
975        return this;
976    }
977
978    /**
979     * Set the character to ignore.
980     * <p>
981     * This character is ignored when parsing the String, unless it is
982     * within a quoted region.
983     *
984     * @param ignored  the ignored character to use
985     * @return this, to enable chaining
986     */
987    public StrTokenizer setIgnoredChar(final char ignored) {
988        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
989    }
990
991    // Trimmer
992    //-----------------------------------------------------------------------
993    /**
994     * Gets the trimmer character matcher.
995     * <p>
996     * These characters are trimmed off on each side of the delimiter
997     * until the token or quote is found.
998     * The default value is not to trim anything.
999     *
1000     * @return The trimmer matcher in use
1001     */
1002    public StrMatcher getTrimmerMatcher() {
1003        return trimmerMatcher;
1004    }
1005
1006    /**
1007     * Sets the matcher for characters to trim.
1008     * <p>
1009     * These characters are trimmed off on each side of the delimiter
1010     * until the token or quote is found.
1011     *
1012     * @param trimmer  the trimmer matcher to use, null ignored
1013     * @return this, to enable chaining
1014     */
1015    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1016        if (trimmer != null) {
1017            this.trimmerMatcher = trimmer;
1018        }
1019        return this;
1020    }
1021
1022    //-----------------------------------------------------------------------
1023    /**
1024     * Gets whether the tokenizer currently returns empty tokens as null.
1025     * The default for this property is false.
1026     *
1027     * @return true if empty tokens are returned as null
1028     */
1029    public boolean isEmptyTokenAsNull() {
1030        return this.emptyAsNull;
1031    }
1032
1033    /**
1034     * Sets whether the tokenizer should return empty tokens as null.
1035     * The default for this property is false.
1036     *
1037     * @param emptyAsNull  whether empty tokens are returned as null
1038     * @return this, to enable chaining
1039     */
1040    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1041        this.emptyAsNull = emptyAsNull;
1042        return this;
1043    }
1044
1045    //-----------------------------------------------------------------------
1046    /**
1047     * Gets whether the tokenizer currently ignores empty tokens.
1048     * The default for this property is true.
1049     *
1050     * @return true if empty tokens are not returned
1051     */
1052    public boolean isIgnoreEmptyTokens() {
1053        return ignoreEmptyTokens;
1054    }
1055
1056    /**
1057     * Sets whether the tokenizer should ignore and not return empty tokens.
1058     * The default for this property is true.
1059     *
1060     * @param ignoreEmptyTokens  whether empty tokens are not returned
1061     * @return this, to enable chaining
1062     */
1063    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1064        this.ignoreEmptyTokens = ignoreEmptyTokens;
1065        return this;
1066    }
1067
1068    //-----------------------------------------------------------------------
1069    /**
1070     * Gets the String content that the tokenizer is parsing.
1071     *
1072     * @return The string content being parsed
1073     */
1074    public String getContent() {
1075        if (chars == null) {
1076            return null;
1077        }
1078        return new String(chars);
1079    }
1080
1081    //-----------------------------------------------------------------------
1082    /**
1083     * Creates a new instance of this Tokenizer. The new instance is reset so
1084     * that it will be at the start of the token list.
1085     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
1086     *
1087     * @return a new instance of this Tokenizer which has been reset.
1088     */
1089    @Override
1090    public Object clone() {
1091        try {
1092            return cloneReset();
1093        } catch (final CloneNotSupportedException ex) {
1094            return null;
1095        }
1096    }
1097
1098    /**
1099     * Creates a new instance of this Tokenizer. The new instance is reset so that
1100     * it will be at the start of the token list.
1101     *
1102     * @return a new instance of this Tokenizer which has been reset.
1103     * @throws CloneNotSupportedException if there is a problem cloning
1104     */
1105    Object cloneReset() throws CloneNotSupportedException {
1106        // this method exists to enable 100% test coverage
1107        final StrTokenizer cloned = (StrTokenizer) super.clone();
1108        if (cloned.chars != null) {
1109            cloned.chars = cloned.chars.clone();
1110        }
1111        cloned.reset();
1112        return cloned;
1113    }
1114
1115    //-----------------------------------------------------------------------
1116    /**
1117     * Gets the String content that the tokenizer is parsing.
1118     *
1119     * @return The string content being parsed
1120     */
1121    @Override
1122    public String toString() {
1123        if (tokens == null) {
1124            return "StrTokenizer[not tokenized yet]";
1125        }
1126        return "StrTokenizer" + getTokenList();
1127    }
1128
1129}