StrTokenizer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.lang3.text;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.List;
  22. import java.util.ListIterator;
  23. import java.util.NoSuchElementException;
  24. import java.util.StringTokenizer;

  25. import org.apache.commons.lang3.ArrayUtils;
  26. import org.apache.commons.lang3.StringUtils;

  27. /**
  28.  * Tokenizes a string based on delimiters (separators)
  29.  * and supporting quoting and ignored character concepts.
  30.  * <p>
  31.  * This class can split a String into many smaller strings. It aims
  32.  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
  33.  * however it offers much more control and flexibility including implementing
  34.  * the {@link ListIterator} interface. By default, it is set up
  35.  * like {@link StringTokenizer}.
  36.  * </p>
  37.  * <p>
  38.  * The input String is split into a number of <em>tokens</em>.
  39.  * Each token is separated from the next String by a <em>delimiter</em>.
  40.  * One or more delimiter characters must be specified.
  41.  * </p>
  42.  * <p>
  43.  * Each token may be surrounded by quotes.
  44.  * The <em>quote</em> matcher specifies the quote character(s).
  45.  * A quote may be escaped within a quoted section by duplicating itself.
  46.  * </p>
  47.  * <p>
  48.  * Between each token and the delimiter are potentially characters that need trimming.
  49.  * The <em>trimmer</em> matcher specifies these characters.
  50.  * One usage might be to trim whitespace characters.
  51.  * </p>
  52.  * <p>
  53.  * At any point outside the quotes there might potentially be invalid characters.
  54.  * The <em>ignored</em> matcher specifies these characters to be removed.
  55.  * One usage might be to remove new line characters.
  56.  * </p>
  57.  * <p>
  58.  * Empty tokens may be removed or returned as null.
  59.  * </p>
  60.  * <pre>
  61.  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
  62.  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
  63.  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
  64.  * </pre>
  65.  *
  66.  * <table>
  67.  *  <caption>StrTokenizer properties and options</caption>
  68.  *  <tr>
  69.  *   <th>Property</th><th>Type</th><th>Default</th>
  70.  *  </tr>
  71.  *  <tr>
  72.  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
  73.  *  </tr>
  74.  *  <tr>
  75.  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
  76.  *  </tr>
  77.  *  <tr>
  78.  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
  79.  *  </tr>
  80.  *  <tr>
  81.  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
  82.  *  </tr>
  83.  *  <tr>
  84.  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
  85.  *  </tr>
  86.  * </table>
  87.  *
  88.  * @since 2.2
  89.  * @deprecated As of 3.6, use Apache Commons Text
  90.  * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
  91.  * StringTokenizer</a> instead
  92.  */
  93. @Deprecated
  94. public class StrTokenizer implements ListIterator<String>, Cloneable {

  95.     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
  96.     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
  97.     static {
  98.         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  99.         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
  100.         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  101.         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  102.         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  103.         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  104.         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

  105.         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  106.         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
  107.         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  108.         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  109.         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  110.         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  111.         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
  112.     }

  113.     /**
  114.      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  115.      *
  116.      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  117.      */
  118.     private static StrTokenizer getCSVClone() {
  119.         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
  120.     }
  121.     /**
  122.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  123.      * initializing it with the given input.  The default for CSV processing
  124.      * will be trim whitespace from both ends (which can be overridden with
  125.      * the setTrimmer method).
  126.      * <p>
  127.      * You must call a "reset" method to set the string which you want to parse.
  128.      * </p>
  129.      * @return a new tokenizer instance which parses Comma Separated Value strings
  130.      */
  131.     public static StrTokenizer getCSVInstance() {
  132.         return getCSVClone();
  133.     }
  134.     /**
  135.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  136.      * initializing it with the given input.  The default for CSV processing
  137.      * will be trim whitespace from both ends (which can be overridden with
  138.      * the setTrimmer method).
  139.      *
  140.      * @param input  the text to parse
  141.      * @return a new tokenizer instance which parses Comma Separated Value strings
  142.      */
  143.     public static StrTokenizer getCSVInstance(final char[] input) {
  144.         final StrTokenizer tok = getCSVClone();
  145.         tok.reset(input);
  146.         return tok;
  147.     }

  148.     /**
  149.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  150.      * initializing it with the given input.  The default for CSV processing
  151.      * will be trim whitespace from both ends (which can be overridden with
  152.      * the setTrimmer method).
  153.      *
  154.      * @param input  the text to parse
  155.      * @return a new tokenizer instance which parses Comma Separated Value strings
  156.      */
  157.     public static StrTokenizer getCSVInstance(final String input) {
  158.         final StrTokenizer tok = getCSVClone();
  159.         tok.reset(input);
  160.         return tok;
  161.     }
  162.     /**
  163.      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  164.      *
  165.      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  166.      */
  167.     private static StrTokenizer getTSVClone() {
  168.         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
  169.     }
  170.     /**
  171.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  172.      * The default for CSV processing will be trim whitespace from both ends
  173.      * (which can be overridden with the setTrimmer method).
  174.      * <p>
  175.      * You must call a "reset" method to set the string which you want to parse.
  176.      * </p>
  177.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  178.      */
  179.     public static StrTokenizer getTSVInstance() {
  180.         return getTSVClone();
  181.     }
  182.     /**
  183.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  184.      * The default for CSV processing will be trim whitespace from both ends
  185.      * (which can be overridden with the setTrimmer method).
  186.      * @param input  the string to parse
  187.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  188.      */
  189.     public static StrTokenizer getTSVInstance(final char[] input) {
  190.         final StrTokenizer tok = getTSVClone();
  191.         tok.reset(input);
  192.         return tok;
  193.     }

  194.     /**
  195.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  196.      * The default for CSV processing will be trim whitespace from both ends
  197.      * (which can be overridden with the setTrimmer method).
  198.      * @param input  the string to parse
  199.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  200.      */
  201.     public static StrTokenizer getTSVInstance(final String input) {
  202.         final StrTokenizer tok = getTSVClone();
  203.         tok.reset(input);
  204.         return tok;
  205.     }
    /** The text to work on; {@code null} when the tokenizer was created without input. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed. */
    private String[] tokens;

    /** The current iteration position, i.e. the index of the token {@code next()} would return. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to matching nothing. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to matching nothing. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to matching nothing. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;

  224.     /**
  225.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  226.      * as per StringTokenizer, but with no text to tokenize.
  227.      * <p>
  228.      * This constructor is normally used with {@link #reset(String)}.
  229.      * </p>
  230.      */
  231.     public StrTokenizer() {
  232.         this.chars = null;
  233.     }

  234.     /**
  235.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  236.      * as per StringTokenizer.
  237.      *
  238.      * @param input  the string which is to be parsed, not cloned
  239.      */
  240.     public StrTokenizer(final char[] input) {
  241.         this.chars = ArrayUtils.clone(input);
  242.     }

  243.     /**
  244.      * Constructs a tokenizer splitting on the specified character.
  245.      *
  246.      * @param input  the string which is to be parsed, not cloned
  247.      * @param delim the field delimiter character
  248.      */
  249.     public StrTokenizer(final char[] input, final char delim) {
  250.         this(input);
  251.         setDelimiterChar(delim);
  252.     }

  253.     /**
  254.      * Constructs a tokenizer splitting on the specified delimiter character
  255.      * and handling quotes using the specified quote character.
  256.      *
  257.      * @param input  the string which is to be parsed, not cloned
  258.      * @param delim  the field delimiter character
  259.      * @param quote  the field quoted string character
  260.      */
  261.     public StrTokenizer(final char[] input, final char delim, final char quote) {
  262.         this(input, delim);
  263.         setQuoteChar(quote);
  264.     }

  265.     /**
  266.      * Constructs a tokenizer splitting on the specified string.
  267.      *
  268.      * @param input  the string which is to be parsed, not cloned
  269.      * @param delim the field delimiter string
  270.      */
  271.     public StrTokenizer(final char[] input, final String delim) {
  272.         this(input);
  273.         setDelimiterString(delim);
  274.     }

  275.     /**
  276.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  277.      *
  278.      * @param input  the string which is to be parsed, not cloned
  279.      * @param delim  the field delimiter matcher
  280.      */
  281.     public StrTokenizer(final char[] input, final StrMatcher delim) {
  282.         this(input);
  283.         setDelimiterMatcher(delim);
  284.     }

  285.     /**
  286.      * Constructs a tokenizer splitting using the specified delimiter matcher
  287.      * and handling quotes using the specified quote matcher.
  288.      *
  289.      * @param input  the string which is to be parsed, not cloned
  290.      * @param delim  the field delimiter character
  291.      * @param quote  the field quoted string character
  292.      */
  293.     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
  294.         this(input, delim);
  295.         setQuoteMatcher(quote);
  296.     }

  297.     /**
  298.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  299.      * as per StringTokenizer.
  300.      *
  301.      * @param input  the string which is to be parsed
  302.      */
  303.     public StrTokenizer(final String input) {
  304.         if (input != null) {
  305.             chars = input.toCharArray();
  306.         } else {
  307.             chars = null;
  308.         }
  309.     }

  310.     /**
  311.      * Constructs a tokenizer splitting on the specified delimiter character.
  312.      *
  313.      * @param input  the string which is to be parsed
  314.      * @param delim  the field delimiter character
  315.      */
  316.     public StrTokenizer(final String input, final char delim) {
  317.         this(input);
  318.         setDelimiterChar(delim);
  319.     }

  320.     /**
  321.      * Constructs a tokenizer splitting on the specified delimiter character
  322.      * and handling quotes using the specified quote character.
  323.      *
  324.      * @param input  the string which is to be parsed
  325.      * @param delim  the field delimiter character
  326.      * @param quote  the field quoted string character
  327.      */
  328.     public StrTokenizer(final String input, final char delim, final char quote) {
  329.         this(input, delim);
  330.         setQuoteChar(quote);
  331.     }

  332.     /**
  333.      * Constructs a tokenizer splitting on the specified delimiter string.
  334.      *
  335.      * @param input  the string which is to be parsed
  336.      * @param delim  the field delimiter string
  337.      */
  338.     public StrTokenizer(final String input, final String delim) {
  339.         this(input);
  340.         setDelimiterString(delim);
  341.     }

  342.     /**
  343.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  344.      *
  345.      * @param input  the string which is to be parsed
  346.      * @param delim  the field delimiter matcher
  347.      */
  348.     public StrTokenizer(final String input, final StrMatcher delim) {
  349.         this(input);
  350.         setDelimiterMatcher(delim);
  351.     }

  352.     /**
  353.      * Constructs a tokenizer splitting using the specified delimiter matcher
  354.      * and handling quotes using the specified quote matcher.
  355.      *
  356.      * @param input  the string which is to be parsed
  357.      * @param delim  the field delimiter matcher
  358.      * @param quote  the field quoted string matcher
  359.      */
  360.     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
  361.         this(input, delim);
  362.         setQuoteMatcher(quote);
  363.     }

    /**
     * Unsupported ListIterator operation; tokens cannot be inserted through
     * the iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

  373.     /**
  374.      * Adds a token to a list, paying attention to the parameters we've set.
  375.      *
  376.      * @param list  the list to add to
  377.      * @param tok  the token to add
  378.      */
  379.     private void addToken(final List<String> list, String tok) {
  380.         if (StringUtils.isEmpty(tok)) {
  381.             if (isIgnoreEmptyTokens()) {
  382.                 return;
  383.             }
  384.             if (isEmptyTokenAsNull()) {
  385.                 tok = null;
  386.             }
  387.         }
  388.         list.add(tok);
  389.     }

  390.     /**
  391.      * Checks if tokenization has been done, and if not then do it.
  392.      */
  393.     private void checkTokenized() {
  394.         if (tokens == null) {
  395.             if (chars == null) {
  396.                 // still call tokenize as subclass may do some work
  397.                 final List<String> split = tokenize(null, 0, 0);
  398.                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
  399.             } else {
  400.                 final List<String> split = tokenize(chars, 0, chars.length);
  401.                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
  402.             }
  403.         }
  404.     }

  405.     /**
  406.      * Creates a new instance of this Tokenizer. The new instance is reset so
  407.      * that it will be at the start of the token list.
  408.      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
  409.      *
  410.      * @return a new instance of this Tokenizer which has been reset.
  411.      */
  412.     @Override
  413.     public Object clone() {
  414.         try {
  415.             return cloneReset();
  416.         } catch (final CloneNotSupportedException ex) {
  417.             return null;
  418.         }
  419.     }

  420.     /**
  421.      * Creates a new instance of this Tokenizer. The new instance is reset so that
  422.      * it will be at the start of the token list.
  423.      *
  424.      * @return a new instance of this Tokenizer which has been reset.
  425.      * @throws CloneNotSupportedException if there is a problem cloning
  426.      */
  427.     Object cloneReset() throws CloneNotSupportedException {
  428.         // this method exists to enable 100% test coverage
  429.         final StrTokenizer cloned = (StrTokenizer) super.clone();
  430.         if (cloned.chars != null) {
  431.             cloned.chars = cloned.chars.clone();
  432.         }
  433.         cloned.reset();
  434.         return cloned;
  435.     }

  436.     /**
  437.      * Gets the String content that the tokenizer is parsing.
  438.      *
  439.      * @return the string content being parsed
  440.      */
  441.     public String getContent() {
  442.         if (chars == null) {
  443.             return null;
  444.         }
  445.         return new String(chars);
  446.     }

  447.     /**
  448.      * Gets the field delimiter matcher.
  449.      *
  450.      * @return the delimiter matcher in use
  451.      */
  452.     public StrMatcher getDelimiterMatcher() {
  453.         return this.delimMatcher;
  454.     }

  455.     // Ignored
  456.     /**
  457.      * Gets the ignored character matcher.
  458.      * <p>
  459.      * These characters are ignored when parsing the String, unless they are
  460.      * within a quoted region.
  461.      * The default value is not to ignore anything.
  462.      * </p>
  463.      *
  464.      * @return the ignored matcher in use
  465.      */
  466.     public StrMatcher getIgnoredMatcher() {
  467.         return ignoredMatcher;
  468.     }

  469.     /**
  470.      * Gets the quote matcher currently in use.
  471.      * <p>
  472.      * The quote character is used to wrap data between the tokens.
  473.      * This enables delimiters to be entered as data.
  474.      * The default value is '"' (double quote).
  475.      * </p>
  476.      *
  477.      * @return the quote matcher in use
  478.      */
  479.     public StrMatcher getQuoteMatcher() {
  480.         return quoteMatcher;
  481.     }

  482.     /**
  483.      * Gets a copy of the full token list as an independent modifiable array.
  484.      *
  485.      * @return the tokens as a String array
  486.      */
  487.     public String[] getTokenArray() {
  488.         checkTokenized();
  489.         return tokens.clone();
  490.     }

  491.     /**
  492.      * Gets a copy of the full token list as an independent modifiable list.
  493.      *
  494.      * @return the tokens as a String array
  495.      */
  496.     public List<String> getTokenList() {
  497.         checkTokenized();
  498.         final List<String> list = new ArrayList<>(tokens.length);
  499.         list.addAll(Arrays.asList(tokens));
  500.         return list;
  501.     }

  502.     /**
  503.      * Gets the trimmer character matcher.
  504.      * <p>
  505.      * These characters are trimmed off on each side of the delimiter
  506.      * until the token or quote is found.
  507.      * The default value is not to trim anything.
  508.      * </p>
  509.      *
  510.      * @return the trimmer matcher in use
  511.      */
  512.     public StrMatcher getTrimmerMatcher() {
  513.         return trimmerMatcher;
  514.     }

  515.     /**
  516.      * Checks whether there are any more tokens.
  517.      *
  518.      * @return true if there are more tokens
  519.      */
  520.     @Override
  521.     public boolean hasNext() {
  522.         checkTokenized();
  523.         return tokenPos < tokens.length;
  524.     }

  525.     /**
  526.      * Checks whether there are any previous tokens that can be iterated to.
  527.      *
  528.      * @return true if there are previous tokens
  529.      */
  530.     @Override
  531.     public boolean hasPrevious() {
  532.         checkTokenized();
  533.         return tokenPos > 0;
  534.     }

  535.     /**
  536.      * Gets whether the tokenizer currently returns empty tokens as null.
  537.      * The default for this property is false.
  538.      *
  539.      * @return true if empty tokens are returned as null
  540.      */
  541.     public boolean isEmptyTokenAsNull() {
  542.         return this.emptyAsNull;
  543.     }

  544.     /**
  545.      * Gets whether the tokenizer currently ignores empty tokens.
  546.      * The default for this property is true.
  547.      *
  548.      * @return true if empty tokens are not returned
  549.      */
  550.     public boolean isIgnoreEmptyTokens() {
  551.         return ignoreEmptyTokens;
  552.     }

  553.     /**
  554.      * Checks if the characters at the index specified match the quote
  555.      * already matched in readNextToken().
  556.      *
  557.      * @param srcChars  the character array being tokenized
  558.      * @param pos  the position to check for a quote
  559.      * @param len  the length of the character array being tokenized
  560.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  561.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  562.      * @return true if a quote is matched
  563.      */
  564.     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
  565.         for (int i = 0; i < quoteLen; i++) {
  566.             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
  567.                 return false;
  568.             }
  569.         }
  570.         return true;
  571.     }

  572.     /**
  573.      * Gets the next token.
  574.      *
  575.      * @return the next String token
  576.      * @throws NoSuchElementException if there are no more elements
  577.      */
  578.     @Override
  579.     public String next() {
  580.         if (hasNext()) {
  581.             return tokens[tokenPos++];
  582.         }
  583.         throw new NoSuchElementException();
  584.     }

  585.     /**
  586.      * Gets the index of the next token to return.
  587.      *
  588.      * @return the next token index
  589.      */
  590.     @Override
  591.     public int nextIndex() {
  592.         return tokenPos;
  593.     }

  594.     /**
  595.      * Gets the next token from the String.
  596.      * Equivalent to {@link #next()} except it returns null rather than
  597.      * throwing {@link NoSuchElementException} when no tokens remain.
  598.      *
  599.      * @return the next sequential token, or null when no more tokens are found
  600.      */
  601.     public String nextToken() {
  602.         if (hasNext()) {
  603.             return tokens[tokenPos++];
  604.         }
  605.         return null;
  606.     }

  607.     /**
  608.      * Gets the token previous to the last returned token.
  609.      *
  610.      * @return the previous token
  611.      */
  612.     @Override
  613.     public String previous() {
  614.         if (hasPrevious()) {
  615.             return tokens[--tokenPos];
  616.         }
  617.         throw new NoSuchElementException();
  618.     }

  619.     /**
  620.      * Gets the index of the previous token.
  621.      *
  622.      * @return the previous token index
  623.      */
  624.     @Override
  625.     public int previousIndex() {
  626.         return tokenPos - 1;
  627.     }

  628.     /**
  629.      * Gets the previous token from the String.
  630.      *
  631.      * @return the previous sequential token, or null when no more tokens are found
  632.      */
  633.     public String previousToken() {
  634.         if (hasPrevious()) {
  635.             return tokens[--tokenPos];
  636.         }
  637.         return null;
  638.     }

  639.     /**
  640.      * Reads character by character through the String to get the next token.
  641.      *
  642.      * @param srcChars  the character array being tokenized
  643.      * @param start  the first character of field
  644.      * @param len  the length of the character array being tokenized
  645.      * @param workArea  a temporary work area
  646.      * @param tokenList  the list of parsed tokens
  647.      * @return the starting position of the next field (the character
  648.      *  immediately after the delimiter), or -1 if end of string found
  649.      */
  650.     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
  651.         // skip all leading whitespace, unless it is the
  652.         // field delimiter or the quote character
  653.         while (start < len) {
  654.             final int removeLen = Math.max(
  655.                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
  656.                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
  657.             if (removeLen == 0 ||
  658.                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
  659.                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
  660.                 break;
  661.             }
  662.             start += removeLen;
  663.         }

  664.         // handle reaching end
  665.         if (start >= len) {
  666.             addToken(tokenList, StringUtils.EMPTY);
  667.             return -1;
  668.         }

  669.         // handle empty token
  670.         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
  671.         if (delimLen > 0) {
  672.             addToken(tokenList, StringUtils.EMPTY);
  673.             return start + delimLen;
  674.         }

  675.         // handle found token
  676.         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
  677.         if (quoteLen > 0) {
  678.             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
  679.         }
  680.         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
  681.     }

  682.     /**
  683.      * Reads a possibly quoted string token.
  684.      *
  685.      * @param srcChars  the character array being tokenized
  686.      * @param start  the first character of field
  687.      * @param len  the length of the character array being tokenized
  688.      * @param workArea  a temporary work area
  689.      * @param tokenList  the list of parsed tokens
  690.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  691.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  692.      * @return the starting position of the next field (the character
  693.      *  immediately after the delimiter, or if end of string found,
  694.      *  then the length of string
  695.      */
  696.     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
  697.                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
  698.         // Loop until we've found the end of the quoted
  699.         // string or the end of the input
  700.         workArea.clear();
  701.         int pos = start;
  702.         boolean quoting = quoteLen > 0;
  703.         int trimStart = 0;

  704.         while (pos < len) {
  705.             // quoting mode can occur several times throughout a string
  706.             // we must switch between quoting and non-quoting until we
  707.             // encounter a non-quoted delimiter, or end of string
  708.             if (quoting) {
  709.                 // In quoting mode

  710.                 // If we've found a quote character, see if it's
  711.                 // followed by a second quote.  If so, then we need
  712.                 // to actually put the quote character into the token
  713.                 // rather than end the token.
  714.                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  715.                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
  716.                         // matched pair of quotes, thus an escaped quote
  717.                         workArea.append(srcChars, pos, quoteLen);
  718.                         pos += quoteLen * 2;
  719.                         trimStart = workArea.size();
  720.                         continue;
  721.                     }

  722.                     // end of quoting
  723.                     quoting = false;
  724.                     pos += quoteLen;
  725.                     continue;
  726.                 }

  727.             } else {
  728.                 // Not in quoting mode

  729.                 // check for delimiter, and thus end of token
  730.                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
  731.                 if (delimLen > 0) {
  732.                     // return condition when end of token found
  733.                     addToken(tokenList, workArea.substring(0, trimStart));
  734.                     return pos + delimLen;
  735.                 }

  736.                 // check for quote, and thus back into quoting mode
  737.                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  738.                     quoting = true;
  739.                     pos += quoteLen;
  740.                     continue;
  741.                 }

  742.                 // check for ignored (outside quotes), and ignore
  743.                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
  744.                 if (ignoredLen > 0) {
  745.                     pos += ignoredLen;
  746.                     continue;
  747.                 }

  748.                 // check for trimmed character
  749.                 // don't yet know if it's at the end, so copy to workArea
  750.                 // use trimStart to keep track of trim at the end
  751.                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
  752.                 if (trimmedLen > 0) {
  753.                     workArea.append(srcChars, pos, trimmedLen);
  754.                     pos += trimmedLen;
  755.                     continue;
  756.                 }
  757.             }
  758.             // copy regular character from inside quotes
  759.             workArea.append(srcChars[pos++]);
  760.             trimStart = workArea.size();
  761.         }

  762.         // return condition when end of string found
  763.         addToken(tokenList, workArea.substring(0, trimStart));
  764.         return -1;
  765.     }

  766.     /**
  767.      * Unsupported ListIterator operation.
  768.      *
  769.      * @throws UnsupportedOperationException always
  770.      */
  771.     @Override
  772.     public void remove() {
  773.         throw new UnsupportedOperationException("remove() is unsupported");
  774.     }

  775.     /**
  776.      * Resets this tokenizer, forgetting all parsing and iteration already completed.
  777.      * <p>
  778.      * This method allows the same tokenizer to be reused for the same String.
  779.      * </p>
  780.      *
  781.      * @return this, to enable chaining
  782.      */
  783.     public StrTokenizer reset() {
  784.         tokenPos = 0;
  785.         tokens = null;
  786.         return this;
  787.     }

  788.     /**
  789.      * Reset this tokenizer, giving it a new input string to parse.
  790.      * In this manner you can re-use a tokenizer with the same settings
  791.      * on multiple input lines.
  792.      *
  793.      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
  794.      * @return this, to enable chaining
  795.      */
  796.     public StrTokenizer reset(final char[] input) {
  797.         reset();
  798.         this.chars = ArrayUtils.clone(input);
  799.         return this;
  800.     }

  801.     /**
  802.      * Reset this tokenizer, giving it a new input string to parse.
  803.      * In this manner you can re-use a tokenizer with the same settings
  804.      * on multiple input lines.
  805.      *
  806.      * @param input  the new string to tokenize, null sets no text to parse
  807.      * @return this, to enable chaining
  808.      */
  809.     public StrTokenizer reset(final String input) {
  810.         reset();
  811.         if (input != null) {
  812.             this.chars = input.toCharArray();
  813.         } else {
  814.             this.chars = null;
  815.         }
  816.         return this;
  817.     }

  818.     /**
  819.      * Unsupported ListIterator operation.
  820.      * @param obj this parameter ignored.
  821.      * @throws UnsupportedOperationException always
  822.      */
  823.     @Override
  824.     public void set(final String obj) {
  825.         throw new UnsupportedOperationException("set() is unsupported");
  826.     }

  827.     /**
  828.      * Sets the field delimiter character.
  829.      *
  830.      * @param delim  the delimiter character to use
  831.      * @return this, to enable chaining
  832.      */
  833.     public StrTokenizer setDelimiterChar(final char delim) {
  834.         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
  835.     }

  836.     /**
  837.      * Sets the field delimiter matcher.
  838.      * <p>
  839.      * The delimiter is used to separate one token from another.
  840.      * </p>
  841.      *
  842.      * @param delim  the delimiter matcher to use
  843.      * @return this, to enable chaining
  844.      */
  845.     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
  846.         if (delim == null) {
  847.             this.delimMatcher = StrMatcher.noneMatcher();
  848.         } else {
  849.             this.delimMatcher = delim;
  850.         }
  851.         return this;
  852.     }

  853.     /**
  854.      * Sets the field delimiter string.
  855.      *
  856.      * @param delim  the delimiter string to use
  857.      * @return this, to enable chaining
  858.      */
  859.     public StrTokenizer setDelimiterString(final String delim) {
  860.         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
  861.     }

  862.     /**
  863.      * Sets whether the tokenizer should return empty tokens as null.
  864.      * The default for this property is false.
  865.      *
  866.      * @param emptyAsNull  whether empty tokens are returned as null
  867.      * @return this, to enable chaining
  868.      */
  869.     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
  870.         this.emptyAsNull = emptyAsNull;
  871.         return this;
  872.     }

  873.     /**
  874.      * Sets the character to ignore.
  875.      * <p>
  876.      * This character is ignored when parsing the String, unless it is
  877.      * within a quoted region.
  878.      *
  879.      * @param ignored  the ignored character to use
  880.      * @return this, to enable chaining
  881.      */
  882.     public StrTokenizer setIgnoredChar(final char ignored) {
  883.         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
  884.     }

  885.     /**
  886.      * Sets the matcher for characters to ignore.
  887.      * <p>
  888.      * These characters are ignored when parsing the String, unless they are
  889.      * within a quoted region.
  890.      * </p>
  891.      *
  892.      * @param ignored  the ignored matcher to use, null ignored
  893.      * @return this, to enable chaining
  894.      */
  895.     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
  896.         if (ignored != null) {
  897.             this.ignoredMatcher = ignored;
  898.         }
  899.         return this;
  900.     }

  901.     /**
  902.      * Sets whether the tokenizer should ignore and not return empty tokens.
  903.      * The default for this property is true.
  904.      *
  905.      * @param ignoreEmptyTokens  whether empty tokens are not returned
  906.      * @return this, to enable chaining
  907.      */
  908.     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
  909.         this.ignoreEmptyTokens = ignoreEmptyTokens;
  910.         return this;
  911.     }

  912.     /**
  913.      * Sets the quote character to use.
  914.      * <p>
  915.      * The quote character is used to wrap data between the tokens.
  916.      * This enables delimiters to be entered as data.
  917.      * </p>
  918.      *
  919.      * @param quote  the quote character to use
  920.      * @return this, to enable chaining
  921.      */
  922.     public StrTokenizer setQuoteChar(final char quote) {
  923.         return setQuoteMatcher(StrMatcher.charMatcher(quote));
  924.     }

  925.     /**
  926.      * Sets the quote matcher to use.
  927.      * <p>
  928.      * The quote character is used to wrap data between the tokens.
  929.      * This enables delimiters to be entered as data.
  930.      * </p>
  931.      *
  932.      * @param quote  the quote matcher to use, null ignored
  933.      * @return this, to enable chaining
  934.      */
  935.     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
  936.         if (quote != null) {
  937.             this.quoteMatcher = quote;
  938.         }
  939.         return this;
  940.     }

  941.     /**
  942.      * Sets the matcher for characters to trim.
  943.      * <p>
  944.      * These characters are trimmed off on each side of the delimiter
  945.      * until the token or quote is found.
  946.      * </p>
  947.      *
  948.      * @param trimmer  the trimmer matcher to use, null ignored
  949.      * @return this, to enable chaining
  950.      */
  951.     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
  952.         if (trimmer != null) {
  953.             this.trimmerMatcher = trimmer;
  954.         }
  955.         return this;
  956.     }

  957.     // API
  958.     /**
  959.      * Gets the number of tokens found in the String.
  960.      *
  961.      * @return the number of matched tokens
  962.      */
  963.     public int size() {
  964.         checkTokenized();
  965.         return tokens.length;
  966.     }

  967.     /**
  968.      * Internal method to performs the tokenization.
  969.      * <p>
  970.      * Most users of this class do not need to call this method. This method
  971.      * will be called automatically by other (public) methods when required.
  972.      * </p>
  973.      * <p>
  974.      * This method exists to allow subclasses to add code before or after the
  975.      * tokenization. For example, a subclass could alter the character array,
  976.      * offset or count to be parsed, or call the tokenizer multiple times on
  977.      * multiple strings. It is also be possible to filter the results.
  978.      * </p>
  979.      * <p>
  980.      * {@link StrTokenizer} will always pass a zero offset and a count
  981.      * equal to the length of the array to this method, however a subclass
  982.      * may pass other values, or even an entirely different array.
  983.      * </p>
  984.      *
  985.      * @param srcChars  the character array being tokenized, may be null
  986.      * @param offset  the start position within the character array, must be valid
  987.      * @param count  the number of characters to tokenize, must be valid
  988.      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
  989.      */
  990.     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
  991.         if (ArrayUtils.isEmpty(srcChars)) {
  992.             return Collections.emptyList();
  993.         }
  994.         final StrBuilder buf = new StrBuilder();
  995.         final List<String> tokenList = new ArrayList<>();
  996.         int pos = offset;

  997.         // loop around the entire buffer
  998.         while (pos >= 0 && pos < count) {
  999.             // find next token
  1000.             pos = readNextToken(srcChars, pos, count, buf, tokenList);

  1001.             // handle case where end of string is a delimiter
  1002.             if (pos >= count) {
  1003.                 addToken(tokenList, StringUtils.EMPTY);
  1004.             }
  1005.         }
  1006.         return tokenList;
  1007.     }

  1008.     /**
  1009.      * Gets the String content that the tokenizer is parsing.
  1010.      *
  1011.      * @return the string content being parsed
  1012.      */
  1013.     @Override
  1014.     public String toString() {
  1015.         if (tokens == null) {
  1016.             return "StrTokenizer[not tokenized yet]";
  1017.         }
  1018.         return "StrTokenizer" + getTokenList();
  1019.     }

  1020. }