StringTokenizer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.List;
  22. import java.util.ListIterator;
  23. import java.util.NoSuchElementException;

  24. import org.apache.commons.lang3.ArrayUtils;
  25. import org.apache.commons.lang3.StringUtils;
  26. import org.apache.commons.text.matcher.StringMatcher;
  27. import org.apache.commons.text.matcher.StringMatcherFactory;

  28. /**
  29.  * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
  30.  * <p>
  31.  * This class can split a String into many smaller strings. It aims to do a similar job to
  32.  * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
  33.  * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
  34.  * <p>
  35.  * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
  36.  * <em>delimiter</em>. One or more delimiter characters must be specified.
  37.  * <p>
  38.  * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
  39.  * escaped within a quoted section by duplicating itself.
  40.  * <p>
  41.  * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
  42.  * specifies these characters. One usage might be to trim whitespace characters.
  43.  * <p>
  44.  * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
  45.  * these characters to be removed. One usage might be to remove new line characters.
  46.  * <p>
  47.  * Empty tokens may be removed or returned as null.
  48.  *
  49.  * <pre>
  50.  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
  51.  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
  52.  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
  53.  * </pre>
  54.  *
  55.  * <table>
  56.  * <caption>StringTokenizer properties and options</caption>
  57.  * <tr>
  58.  * <th>Property</th>
  59.  * <th>Type</th>
  60.  * <th>Default</th>
  61.  * </tr>
  62.  * <tr>
  63.  * <td>delim</td>
  64.  * <td>CharSetMatcher</td>
  65.  * <td>{ \t\n\r\f}</td>
  66.  * </tr>
  67.  * <tr>
  68.  * <td>quote</td>
  69.  * <td>NoneMatcher</td>
  70.  * <td>{}</td>
  71.  * </tr>
  72.  * <tr>
  73.  * <td>ignore</td>
  74.  * <td>NoneMatcher</td>
  75.  * <td>{}</td>
  76.  * </tr>
  77.  * <tr>
  78.  * <td>emptyTokenAsNull</td>
  79.  * <td>boolean</td>
  80.  * <td>false</td>
  81.  * </tr>
  82.  * <tr>
  83.  * <td>ignoreEmptyTokens</td>
  84.  * <td>boolean</td>
  85.  * <td>true</td>
  86.  * </tr>
  87.  * </table>
  88.  *
  89.  * @since 1.3
  90.  */
  91. public class StringTokenizer implements ListIterator<String>, Cloneable {

  /** Pre-configured comma separated values tokenizer; only ever cloned, never handed out directly. */
  private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;

  /** Pre-configured tab separated values tokenizer; only ever cloned, never handed out directly. */
  private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;

  static {
    final StringMatcherFactory matchers = StringMatcherFactory.INSTANCE;

    // CSV: comma delimiter, double-quote quoting, nothing ignored, trim matcher, empty tokens kept as "".
    final StringTokenizer csv = new StringTokenizer();
    csv.setDelimiterMatcher(matchers.commaMatcher());
    csv.setQuoteMatcher(matchers.doubleQuoteMatcher());
    csv.setIgnoredMatcher(matchers.noneMatcher());
    csv.setTrimmerMatcher(matchers.trimMatcher());
    csv.setEmptyTokenAsNull(false);
    csv.setIgnoreEmptyTokens(false);
    CSV_TOKENIZER_PROTOTYPE = csv;

    // TSV: identical configuration except that the delimiter is a tab.
    final StringTokenizer tsv = new StringTokenizer();
    tsv.setDelimiterMatcher(matchers.tabMatcher());
    tsv.setQuoteMatcher(matchers.doubleQuoteMatcher());
    tsv.setIgnoredMatcher(matchers.noneMatcher());
    tsv.setTrimmerMatcher(matchers.trimMatcher());
    tsv.setEmptyTokenAsNull(false);
    tsv.setIgnoreEmptyTokens(false);
    TSV_TOKENIZER_PROTOTYPE = tsv;
  }

  112.     /**
  113.      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  114.      *
  115.      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  116.      */
  117.     private static StringTokenizer getCSVClone() {
  118.         return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
  119.     }

  120.     /**
  121.      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
  122.      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
  123.      * setTrimmer method).
  124.      * <p>
  125.      * You must call a "reset" method to set the string which you want to parse.
  126.      * </p>
  127.      *
  128.      * @return a new tokenizer instance which parses Comma Separated Value strings
  129.      */
  130.     public static StringTokenizer getCSVInstance() {
  131.         return getCSVClone();
  132.     }

  133.     /**
  134.      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
  135.      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
  136.      * setTrimmer method).
  137.      *
  138.      * @param input
  139.      *            the text to parse
  140.      * @return a new tokenizer instance which parses Comma Separated Value strings
  141.      */
  142.     public static StringTokenizer getCSVInstance(final char[] input) {
  143.         return getCSVClone().reset(input);
  144.     }

  145.     /**
  146.      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
  147.      * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
  148.      * setTrimmer method).
  149.      *
  150.      * @param input
  151.      *            the text to parse
  152.      * @return a new tokenizer instance which parses Comma Separated Value strings
  153.      */
  154.     public static StringTokenizer getCSVInstance(final String input) {
  155.         return getCSVClone().reset(input);
  156.     }

  157.     /**
  158.      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  159.      *
  160.      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  161.      */
  162.     private static StringTokenizer getTSVClone() {
  163.         return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
  164.     }

  165.     /**
  166.      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
  167.      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
  168.      * <p>
  169.      * You must call a "reset" method to set the string which you want to parse.
  170.      * </p>
  171.      *
  172.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  173.      */
  174.     public static StringTokenizer getTSVInstance() {
  175.         return getTSVClone();
  176.     }

  177.     /**
  178.      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
  179.      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
  180.      *
  181.      * @param input
  182.      *            the string to parse
  183.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  184.      */
  185.     public static StringTokenizer getTSVInstance(final char[] input) {
  186.         return getTSVClone().reset(input);
  187.     }

  188.     /**
  189.      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
  190.      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
  191.      *
  192.      * @param input
  193.      *            the string to parse
  194.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  195.      */
  196.     public static StringTokenizer getTSVInstance(final String input) {
  197.         return getTSVClone().reset(input);
  198.     }

  199.     /** The characters to tokenize; {@code null} when no input has been supplied. */
  200.     private char[] chars;

  201.     /** The parsed tokens; {@code null} until tokenization has been performed. */
  202.     private String[] tokens;

  203.     /** The current iteration position: the index into {@code tokens} of the next token to return. */
  204.     private int tokenPos;

  205.     /** The delimiter matcher; defaults to the factory's split matcher. */
  206.     private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

  207.     /** The quote matcher; defaults to the none matcher, i.e. quoting is disabled. */
  208.     private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

  209.     /** The ignored matcher; defaults to the none matcher, i.e. nothing is ignored. */
  210.     private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

  211.     /** The trimmer matcher; defaults to the none matcher, i.e. nothing is trimmed. */
  212.     private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

  213.     /** Whether to return empty tokens as null; defaults to {@code false}. */
  214.     private boolean emptyAsNull;

  215.     /** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
  216.     private boolean ignoreEmptyTokens = true;

  /**
   * Constructs a tokenizer with the default matchers (delimiter: the factory's split matcher) and no text to
   * tokenize.
   * <p>
   * This constructor is normally used with {@link #reset(String)}.
   * </p>
   */
  public StringTokenizer() {
    chars = null;
  }

  /**
   * Constructs a tokenizer with the default matchers (delimiter: the factory's split matcher) over the given
   * characters.
   *
   * @param input the characters to parse; a non-null array is copied, so later changes to it are not seen
   */
  public StringTokenizer(final char[] input) {
    if (input == null) {
      this.chars = null;
    } else {
      this.chars = input.clone();
    }
  }

  /**
   * Constructs a tokenizer over the given characters, splitting on the specified character.
   *
   * @param input the characters to parse; a non-null array is copied
   * @param delim the field delimiter character
   */
  public StringTokenizer(final char[] input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
  }

  /**
   * Constructs a tokenizer over the given characters, splitting on the specified delimiter character and
   * handling quotes using the specified quote character.
   *
   * @param input the characters to parse; a non-null array is copied
   * @param delim the field delimiter character
   * @param quote the field quoted string character
   */
  public StringTokenizer(final char[] input, final char delim, final char quote) {
    // Same sequence as delegating to (input, delim) and then setting the quote.
    this(input);
    setDelimiterChar(delim);
    setQuoteChar(quote);
  }

  /**
   * Constructs a tokenizer over the given characters, splitting on the specified string.
   *
   * @param input the characters to parse; a non-null array is copied
   * @param delim the field delimiter string
   */
  public StringTokenizer(final char[] input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
  }

  /**
   * Constructs a tokenizer over the given characters, splitting using the specified delimiter matcher.
   *
   * @param input the characters to parse; a non-null array is copied
   * @param delim the field delimiter matcher
   */
  public StringTokenizer(final char[] input, final StringMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
  }

  /**
   * Constructs a tokenizer over the given characters, splitting using the specified delimiter matcher and
   * handling quotes using the specified quote matcher.
   *
   * @param input the characters to parse; a non-null array is copied
   * @param delim the field delimiter matcher
   * @param quote the field quoted string matcher
   */
  public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
    // Same sequence as delegating to (input, delim) and then setting the quote matcher.
    this(input);
    setDelimiterMatcher(delim);
    setQuoteMatcher(quote);
  }

  /**
   * Constructs a tokenizer with the default matchers (delimiter: the factory's split matcher) over the given
   * string.
   *
   * @param input the string which is to be parsed, may be {@code null}
   */
  public StringTokenizer(final String input) {
    if (input == null) {
      this.chars = null;
    } else {
      this.chars = input.toCharArray();
    }
  }

  /**
   * Constructs a tokenizer over the given string, splitting on the specified delimiter character.
   *
   * @param input the string which is to be parsed
   * @param delim the field delimiter character
   */
  public StringTokenizer(final String input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
  }

  /**
   * Constructs a tokenizer over the given string, splitting on the specified delimiter character and handling
   * quotes using the specified quote character.
   *
   * @param input the string which is to be parsed
   * @param delim the field delimiter character
   * @param quote the field quoted string character
   */
  public StringTokenizer(final String input, final char delim, final char quote) {
    // Same sequence as delegating to (input, delim) and then setting the quote.
    this(input);
    setDelimiterChar(delim);
    setQuoteChar(quote);
  }

  /**
   * Constructs a tokenizer over the given string, splitting on the specified delimiter string.
   *
   * @param input the string which is to be parsed
   * @param delim the field delimiter string
   */
  public StringTokenizer(final String input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
  }

  /**
   * Constructs a tokenizer over the given string, splitting using the specified delimiter matcher.
   *
   * @param input the string which is to be parsed
   * @param delim the field delimiter matcher
   */
  public StringTokenizer(final String input, final StringMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
  }

  /**
   * Constructs a tokenizer over the given string, splitting using the specified delimiter matcher and handling
   * quotes using the specified quote matcher.
   *
   * @param input the string which is to be parsed
   * @param delim the field delimiter matcher
   * @param quote the field quoted string matcher
   */
  public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
    // Same sequence as delegating to (input, delim) and then setting the quote matcher.
    this(input);
    setDelimiterMatcher(delim);
    setQuoteMatcher(quote);
  }

  377.     /**
  378.      * Unsupported ListIterator operation.
  379.      *
  380.      * @param obj
  381.      *            this parameter ignored.
  382.      * @throws UnsupportedOperationException
  383.      *             always
  384.      */
  385.     @Override
  386.     public void add(final String obj) {
  387.         throw new UnsupportedOperationException("add() is unsupported");
  388.     }

  389.     /**
  390.      * Adds a token to a list, paying attention to the parameters we've set.
  391.      *
  392.      * @param list
  393.      *            the list to add to
  394.      * @param tok
  395.      *            the token to add
  396.      */
  397.     private void addToken(final List<String> list, String tok) {
  398.         if (tok == null || tok.isEmpty()) {
  399.             if (isIgnoreEmptyTokens()) {
  400.                 return;
  401.             }
  402.             if (isEmptyTokenAsNull()) {
  403.                 tok = null;
  404.             }
  405.         }
  406.         list.add(tok);
  407.     }

  408.     /**
  409.      * Checks if tokenization has been done, and if not then do it.
  410.      */
  411.     private void checkTokenized() {
  412.         if (tokens == null) {
  413.             final List<String> split;
  414.             if (chars == null) {
  415.                 // still call tokenize as subclass may do some work
  416.                 split = tokenize(null, 0, 0);
  417.             } else {
  418.                 split = tokenize(chars, 0, chars.length);
  419.             }
  420.             tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
  421.         }
  422.     }

  423.     /**
  424.      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
  425.      * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
  426.      *
  427.      * @return a new instance of this Tokenizer which has been reset.
  428.      */
  429.     @Override
  430.     public Object clone() {
  431.         try {
  432.             return cloneReset();
  433.         } catch (final CloneNotSupportedException ex) {
  434.             return null;
  435.         }
  436.     }

  437.     /**
  438.      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
  439.      * list.
  440.      *
  441.      * @return a new instance of this Tokenizer which has been reset.
  442.      * @throws CloneNotSupportedException
  443.      *             if there is a problem cloning
  444.      */
  445.     Object cloneReset() throws CloneNotSupportedException {
  446.         // this method exists to enable 100% test coverage
  447.         final StringTokenizer cloned = (StringTokenizer) super.clone();
  448.         if (cloned.chars != null) {
  449.             cloned.chars = cloned.chars.clone();
  450.         }
  451.         cloned.reset();
  452.         return cloned;
  453.     }

  454.     /**
  455.      * Gets the String content that the tokenizer is parsing.
  456.      *
  457.      * @return The string content being parsed
  458.      */
  459.     public String getContent() {
  460.         if (chars == null) {
  461.             return null;
  462.         }
  463.         return new String(chars);
  464.     }

  465.     /**
  466.      * Gets the field delimiter matcher.
  467.      *
  468.      * @return The delimiter matcher in use
  469.      */
  470.     public StringMatcher getDelimiterMatcher() {
  471.         return this.delimMatcher;
  472.     }

  473.     /**
  474.      * Gets the ignored character matcher.
  475.      * <p>
  476.      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
  477.      * is not to ignore anything.
  478.      * </p>
  479.      *
  480.      * @return The ignored matcher in use
  481.      */
  482.     public StringMatcher getIgnoredMatcher() {
  483.         return ignoredMatcher;
  484.     }

  485.     /**
  486.      * Gets the quote matcher currently in use.
  487.      * <p>
  488.      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
  489.      * default value is '"' (double quote).
  490.      * </p>
  491.      *
  492.      * @return The quote matcher in use
  493.      */
  494.     public StringMatcher getQuoteMatcher() {
  495.         return quoteMatcher;
  496.     }

  497.     /**
  498.      * Gets a copy of the full token list as an independent modifiable array.
  499.      *
  500.      * @return The tokens as a String array
  501.      */
  502.     public String[] getTokenArray() {
  503.         checkTokenized();
  504.         return tokens.clone();
  505.     }

  506.     /**
  507.      * Gets a copy of the full token list as an independent modifiable list.
  508.      *
  509.      * @return The tokens as a String list
  510.      */
  511.     public List<String> getTokenList() {
  512.         checkTokenized();
  513.         return new ArrayList<>(Arrays.asList(tokens));
  514.     }

  515.     /**
  516.      * Gets the trimmer character matcher.
  517.      * <p>
  518.      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
  519.      * value is not to trim anything.
  520.      * </p>
  521.      *
  522.      * @return The trimmer matcher in use
  523.      */
  524.     public StringMatcher getTrimmerMatcher() {
  525.         return trimmerMatcher;
  526.     }

  527.     /**
  528.      * Tests whether there are any more tokens.
  529.      *
  530.      * @return true if there are more tokens
  531.      */
  532.     @Override
  533.     public boolean hasNext() {
  534.         checkTokenized();
  535.         return tokenPos < tokens.length;
  536.     }

  537.     /**
  538.      * Tests whether there are any previous tokens that can be iterated to.
  539.      *
  540.      * @return true if there are previous tokens
  541.      */
  542.     @Override
  543.     public boolean hasPrevious() {
  544.         checkTokenized();
  545.         return tokenPos > 0;
  546.     }

  547.     /**
  548.      * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
  549.      *
  550.      * @return true if empty tokens are returned as null
  551.      */
  552.     public boolean isEmptyTokenAsNull() {
  553.         return this.emptyAsNull;
  554.     }

  555.     /**
  556.      * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
  557.      *
  558.      * @return true if empty tokens are not returned
  559.      */
  560.     public boolean isIgnoreEmptyTokens() {
  561.         return ignoreEmptyTokens;
  562.     }

  563.     /**
  564.      * Tests if the characters at the index specified match the quote already matched in readNextToken().
  565.      *
  566.      * @param srcChars
  567.      *            the character array being tokenized
  568.      * @param pos
  569.      *            the position to check for a quote
  570.      * @param len
  571.      *            the length of the character array being tokenized
  572.      * @param quoteStart
  573.      *            the start position of the matched quote, 0 if no quoting
  574.      * @param quoteLen
  575.      *            the length of the matched quote, 0 if no quoting
  576.      * @return true if a quote is matched
  577.      */
  578.     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
  579.             final int quoteLen) {
  580.         for (int i = 0; i < quoteLen; i++) {
  581.             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
  582.                 return false;
  583.             }
  584.         }
  585.         return true;
  586.     }

  587.     /**
  588.      * Gets the next token.
  589.      *
  590.      * @return The next String token
  591.      * @throws NoSuchElementException
  592.      *             if there are no more elements
  593.      */
  594.     @Override
  595.     public String next() {
  596.         if (hasNext()) {
  597.             return tokens[tokenPos++];
  598.         }
  599.         throw new NoSuchElementException();
  600.     }

  601.     /**
  602.      * Gets the index of the next token to return.
  603.      *
  604.      * @return The next token index
  605.      */
  606.     @Override
  607.     public int nextIndex() {
  608.         return tokenPos;
  609.     }

  610.     /**
  611.      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
  612.      * {@link NoSuchElementException} when no tokens remain.
  613.      *
  614.      * @return The next sequential token, or null when no more tokens are found
  615.      */
  616.     public String nextToken() {
  617.         if (hasNext()) {
  618.             return tokens[tokenPos++];
  619.         }
  620.         return null;
  621.     }

  622.     /**
  623.      * Gets the token previous to the last returned token.
  624.      *
  625.      * @return The previous token
  626.      */
  627.     @Override
  628.     public String previous() {
  629.         if (hasPrevious()) {
  630.             return tokens[--tokenPos];
  631.         }
  632.         throw new NoSuchElementException();
  633.     }

  634.     /**
  635.      * Gets the index of the previous token.
  636.      *
  637.      * @return The previous token index
  638.      */
  639.     @Override
  640.     public int previousIndex() {
  641.         return tokenPos - 1;
  642.     }

  643.     /**
  644.      * Gets the previous token from the String.
  645.      *
  646.      * @return The previous sequential token, or null when no more tokens are found
  647.      */
  648.     public String previousToken() {
  649.         if (hasPrevious()) {
  650.             return tokens[--tokenPos];
  651.         }
  652.         return null;
  653.     }

  654.     /**
  655.      * Reads character by character through the String to get the next token.
  656.      *
  657.      * @param srcChars
  658.      *            the character array being tokenized
  659.      * @param start
  660.      *            the first character of field
  661.      * @param len
  662.      *            the length of the character array being tokenized
  663.      * @param workArea
  664.      *            a temporary work area
  665.      * @param tokenList
  666.      *            the list of parsed tokens
  667.      * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
  668.      *         string found
  669.      */
  670.     private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
  671.             final List<String> tokenList) {
  672.         // skip all leading whitespace, unless it is the
  673.         // field delimiter or the quote character
  674.         while (start < len) {
  675.             final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
  676.                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
  677.             if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
  678.                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
  679.                 break;
  680.             }
  681.             start += removeLen;
  682.         }

  683.         // handle reaching end
  684.         if (start >= len) {
  685.             addToken(tokenList, StringUtils.EMPTY);
  686.             return -1;
  687.         }

  688.         // handle empty token
  689.         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
  690.         if (delimLen > 0) {
  691.             addToken(tokenList, StringUtils.EMPTY);
  692.             return start + delimLen;
  693.         }

  694.         // handle found token
  695.         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
  696.         if (quoteLen > 0) {
  697.             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
  698.         }
  699.         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
  700.     }

  701.     /**
  702.      * Reads a possibly quoted string token.
  703.      *
  704.      * @param srcChars
  705.      *            the character array being tokenized
  706.      * @param start
  707.      *            the first character of field
  708.      * @param len
  709.      *            the length of the character array being tokenized
  710.      * @param workArea
  711.      *            a temporary work area
  712.      * @param tokenList
  713.      *            the list of parsed tokens
  714.      * @param quoteStart
  715.      *            the start position of the matched quote, 0 if no quoting
  716.      * @param quoteLen
  717.      *            the length of the matched quote, 0 if no quoting
  718.      * @return The starting position of the next field (the character immediately after the delimiter, or if end of
  719.      *         string found, then the length of string
  720.      */
  721.     private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
  722.             final List<String> tokenList, final int quoteStart, final int quoteLen) {
  723.         // Loop until we've found the end of the quoted
  724.         // string or the end of the input
  725.         workArea.clear();
  726.         int pos = start;
  727.         boolean quoting = quoteLen > 0;
  728.         int trimStart = 0;

  729.         while (pos < len) {
  730.             // quoting mode can occur several times throughout a string
  731.             // we must switch between quoting and non-quoting until we
  732.             // encounter a non-quoted delimiter, or end of string
  733.             if (quoting) {
  734.                 // In quoting mode

  735.                 // If we've found a quote character, see if it's
  736.                 // followed by a second quote. If so, then we need
  737.                 // to actually put the quote character into the token
  738.                 // rather than end the token.
  739.                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  740.                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
  741.                         // matched pair of quotes, thus an escaped quote
  742.                         workArea.append(srcChars, pos, quoteLen);
  743.                         pos += quoteLen * 2;
  744.                         trimStart = workArea.size();
  745.                         continue;
  746.                     }

  747.                     // end of quoting
  748.                     quoting = false;
  749.                     pos += quoteLen;
  750.                     continue;
  751.                 }

  752.             } else {
  753.                 // Not in quoting mode

  754.                 // check for delimiter, and thus end of token
  755.                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
  756.                 if (delimLen > 0) {
  757.                     // return condition when end of token found
  758.                     addToken(tokenList, workArea.substring(0, trimStart));
  759.                     return pos + delimLen;
  760.                 }

  761.                 // check for quote, and thus back into quoting mode
  762.                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  763.                     quoting = true;
  764.                     pos += quoteLen;
  765.                     continue;
  766.                 }

  767.                 // check for ignored (outside quotes), and ignore
  768.                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
  769.                 if (ignoredLen > 0) {
  770.                     pos += ignoredLen;
  771.                     continue;
  772.                 }

  773.                 // check for trimmed character
  774.                 // don't yet know if its at the end, so copy to workArea
  775.                 // use trimStart to keep track of trim at the end
  776.                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
  777.                 if (trimmedLen > 0) {
  778.                     workArea.append(srcChars, pos, trimmedLen);
  779.                     pos += trimmedLen;
  780.                     continue;
  781.                 }
  782.             }
  783.             // copy regular character from inside quotes
  784.             workArea.append(srcChars[pos++]);
  785.             trimStart = workArea.size();
  786.         }

  787.         // return condition when end of string found
  788.         addToken(tokenList, workArea.substring(0, trimStart));
  789.         return -1;
  790.     }

  791.     /**
  792.      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
  793.      *
  794.      * @throws UnsupportedOperationException
  795.      *             always
  796.      */
  797.     @Override
  798.     public void remove() {
  799.         throw new UnsupportedOperationException("remove() is unsupported");
  800.     }

  801.     /**
  802.      * Resets this tokenizer, forgetting all parsing and iteration already completed.
  803.      * <p>
  804.      * This method allows the same tokenizer to be reused for the same String.
  805.      * </p>
  806.      *
  807.      * @return this, to enable chaining
  808.      */
  809.     public StringTokenizer reset() {
  810.         tokenPos = 0;
  811.         tokens = null;
  812.         return this;
  813.     }

  814.     /**
  815.      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
  816.      * same settings on multiple input lines.
  817.      *
  818.      * @param input
  819.      *            the new character array to tokenize, not cloned, null sets no text to parse
  820.      * @return this, to enable chaining
  821.      */
  822.     public StringTokenizer reset(final char[] input) {
  823.         reset();
  824.         this.chars = input != null ? input.clone() : null;
  825.         return this;
  826.     }

  827.     /**
  828.      * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
  829.      * same settings on multiple input lines.
  830.      *
  831.      * @param input
  832.      *            the new string to tokenize, null sets no text to parse
  833.      * @return this, to enable chaining
  834.      */
  835.     public StringTokenizer reset(final String input) {
  836.         reset();
  837.         this.chars = input != null ? input.toCharArray() : null;
  838.         return this;
  839.     }

  840.     /**
  841.      * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
  842.      *
  843.      * @param obj
  844.      *            this parameter ignored.
  845.      * @throws UnsupportedOperationException
  846.      *             always
  847.      */
  848.     @Override
  849.     public void set(final String obj) {
  850.         throw new UnsupportedOperationException("set() is unsupported");
  851.     }

  852.     /**
  853.      * Sets the field delimiter character.
  854.      *
  855.      * @param delim
  856.      *            the delimiter character to use
  857.      * @return this, to enable chaining
  858.      */
  859.     public StringTokenizer setDelimiterChar(final char delim) {
  860.         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
  861.     }

  862.     /**
  863.      * Sets the field delimiter matcher.
  864.      * <p>
  865.      * The delimiter is used to separate one token from another.
  866.      * </p>
  867.      *
  868.      * @param delim
  869.      *            the delimiter matcher to use
  870.      * @return this, to enable chaining
  871.      */
  872.     public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
  873.         this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
  874.         return this;
  875.     }

  876.     /**
  877.      * Sets the field delimiter string.
  878.      *
  879.      * @param delim
  880.      *            the delimiter string to use
  881.      * @return this, to enable chaining
  882.      */
  883.     public StringTokenizer setDelimiterString(final String delim) {
  884.         return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
  885.     }

  886.     /**
  887.      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
  888.      *
  889.      * @param emptyAsNull
  890.      *            whether empty tokens are returned as null
  891.      * @return this, to enable chaining
  892.      */
  893.     public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
  894.         this.emptyAsNull = emptyAsNull;
  895.         return this;
  896.     }

  897.     /**
  898.      * Sets the character to ignore.
  899.      * <p>
  900.      * This character is ignored when parsing the String, unless it is within a quoted region.
  901.      * </p>
  902.      *
  903.      * @param ignored
  904.      *            the ignored character to use
  905.      * @return this, to enable chaining
  906.      */
  907.     public StringTokenizer setIgnoredChar(final char ignored) {
  908.         return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
  909.     }

  910.     /**
  911.      * Sets the matcher for characters to ignore.
  912.      * <p>
  913.      * These characters are ignored when parsing the String, unless they are within a quoted region.
  914.      * </p>
  915.      *
  916.      * @param ignored
  917.      *            the ignored matcher to use, null ignored
  918.      * @return this, to enable chaining
  919.      */
  920.     public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
  921.         if (ignored != null) {
  922.             this.ignoredMatcher = ignored;
  923.         }
  924.         return this;
  925.     }

  926.     /**
  927.      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
  928.      *
  929.      * @param ignoreEmptyTokens
  930.      *            whether empty tokens are not returned
  931.      * @return this, to enable chaining
  932.      */
  933.     public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
  934.         this.ignoreEmptyTokens = ignoreEmptyTokens;
  935.         return this;
  936.     }

  937.     /**
  938.      * Sets the quote character to use.
  939.      * <p>
  940.      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
  941.      * </p>
  942.      *
  943.      * @param quote
  944.      *            the quote character to use
  945.      * @return this, to enable chaining
  946.      */
  947.     public StringTokenizer setQuoteChar(final char quote) {
  948.         return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
  949.     }

  950.     /**
  951.      * Sets the quote matcher to use.
  952.      * <p>
  953.      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
  954.      * </p>
  955.      *
  956.      * @param quote
  957.      *            the quote matcher to use, null ignored
  958.      * @return this, to enable chaining
  959.      */
  960.     public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
  961.         if (quote != null) {
  962.             this.quoteMatcher = quote;
  963.         }
  964.         return this;
  965.     }

  966.     /**
  967.      * Sets the matcher for characters to trim.
  968.      * <p>
  969.      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
  970.      *
  971.      * @param trimmer
  972.      *            the trimmer matcher to use, null ignored
  973.      * @return this, to enable chaining
  974.      */
  975.     public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
  976.         if (trimmer != null) {
  977.             this.trimmerMatcher = trimmer;
  978.         }
  979.         return this;
  980.     }

  981.     /**
  982.      * Gets the number of tokens found in the String.
  983.      *
  984.      * @return The number of matched tokens
  985.      */
  986.     public int size() {
  987.         checkTokenized();
  988.         return tokens.length;
  989.     }

  990.     /**
  991.      * Internal method to performs the tokenization.
  992.      * <p>
  993.      * Most users of this class do not need to call this method. This method will be called automatically by other
  994.      * (public) methods when required.
  995.      * </p>
  996.      * <p>
  997.      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
  998.      * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
  999.      * strings. It is also be possible to filter the results.
  1000.      * </p>
  1001.      * <p>
  1002.      * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this
  1003.      * method, however a subclass may pass other values, or even an entirely different array.
  1004.      * </p>
  1005.      *
  1006.      * @param srcChars
  1007.      *            the character array being tokenized, may be null
  1008.      * @param offset
  1009.      *            the start position within the character array, must be valid
  1010.      * @param count
  1011.      *            the number of characters to tokenize, must be valid
  1012.      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
  1013.      */
  1014.     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
  1015.         if (srcChars == null || count == 0) {
  1016.             return Collections.emptyList();
  1017.         }
  1018.         final TextStringBuilder buf = new TextStringBuilder();
  1019.         final List<String> tokenList = new ArrayList<>();
  1020.         int pos = offset;

  1021.         // loop around the entire buffer
  1022.         while (pos >= 0 && pos < count) {
  1023.             // find next token
  1024.             pos = readNextToken(srcChars, pos, count, buf, tokenList);

  1025.             // handle case where end of string is a delimiter
  1026.             if (pos >= count) {
  1027.                 addToken(tokenList, StringUtils.EMPTY);
  1028.             }
  1029.         }
  1030.         return tokenList;
  1031.     }

  1032.     /**
  1033.      * Gets the String content that the tokenizer is parsing.
  1034.      *
  1035.      * @return The string content being parsed
  1036.      */
  1037.     @Override
  1038.     public String toString() {
  1039.         if (tokens == null) {
  1040.             return "StringTokenizer[not tokenized yet]";
  1041.         }
  1042.         return "StringTokenizer" + getTokenList();
  1043.     }

  1044. }