StrTokenizer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text;

  18. import java.util.ArrayList;
  19. import java.util.Collections;
  20. import java.util.List;
  21. import java.util.ListIterator;
  22. import java.util.NoSuchElementException;

  23. import org.apache.commons.lang3.ArrayUtils;
  24. import org.apache.commons.lang3.StringUtils;

  25. /**
  26.  * Tokenizes a string based on delimiters (separators)
  27.  * and supporting quoting and ignored character concepts.
  28.  * <p>
  29.  * This class can split a String into many smaller strings. It aims
  30.  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
  31.  * however it offers much more control and flexibility including implementing
  32.  * the {@code ListIterator} interface. By default, it is set up
  33.  * like {@code StringTokenizer}.
  34.  * <p>
  35.  * The input String is split into a number of <em>tokens</em>.
  36.  * Each token is separated from the next String by a <em>delimiter</em>.
  37.  * One or more delimiter characters must be specified.
  38.  * <p>
  39.  * Each token may be surrounded by quotes.
  40.  * The <em>quote</em> matcher specifies the quote character(s).
  41.  * A quote may be escaped within a quoted section by duplicating itself.
  42.  * <p>
  43.  * Between each token and the delimiter are potentially characters that need trimming.
  44.  * The <em>trimmer</em> matcher specifies these characters.
  45.  * One usage might be to trim whitespace characters.
  46.  * <p>
  47.  * At any point outside the quotes there might potentially be invalid characters.
  48.  * The <em>ignored</em> matcher specifies these characters to be removed.
  49.  * One usage might be to remove new line characters.
  50.  * <p>
  51.  * Empty tokens may be removed or returned as null.
  52.  * <pre>
  53.  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
  54.  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
  55.  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
  56.  * </pre>
  57.  *
  58.  * <table>
  59.  *  <caption>StrTokenizer properties and options</caption>
  60.  *  <tr>
  61.  *   <th>Property</th><th>Type</th><th>Default</th>
  62.  *  </tr>
  63.  *  <tr>
  64.  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
  65.  *  </tr>
  66.  *  <tr>
  67.  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
  68.  *  </tr>
  69.  *  <tr>
  70.  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
  71.  *  </tr>
  72.  *  <tr>
  73.  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
  74.  *  </tr>
  75.  *  <tr>
  76.  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
  77.  *  </tr>
  78.  * </table>
  79.  *
  80.  * @since 1.0
  81.  * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
  82.  */
  83. @Deprecated
  84. public class StrTokenizer implements ListIterator<String>, Cloneable {

  85.     /** Comma separated values tokenizer internal variable. */
  86.     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;

  87.     /** Tab separated values tokenizer internal variable. */
  88.     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;

  89.     static {
  90.         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  91.         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
  92.         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  93.         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  94.         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  95.         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  96.         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

  97.         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  98.         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
  99.         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  100.         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  101.         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  102.         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  103.         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
  104.     }

  105.     /**
  106.      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  107.      *
  108.      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
  109.      */
  110.     private static StrTokenizer getCSVClone() {
  111.         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
  112.     }

  113.     /**
  114.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  115.      * initializing it with the given input.  The default for CSV processing
  116.      * will be trim whitespace from both ends (which can be overridden with
  117.      * the setTrimmer method).
  118.      * <p>
  119.      * You must call a "reset" method to set the string which you want to parse.
  120.      * </p>
  121.      * @return a new tokenizer instance which parses Comma Separated Value strings
  122.      */
  123.     public static StrTokenizer getCSVInstance() {
  124.         return getCSVClone();
  125.     }

  126.     /**
  127.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  128.      * initializing it with the given input.  The default for CSV processing
  129.      * will be trim whitespace from both ends (which can be overridden with
  130.      * the setTrimmer method).
  131.      *
  132.      * @param input  the text to parse
  133.      * @return a new tokenizer instance which parses Comma Separated Value strings
  134.      */
  135.     public static StrTokenizer getCSVInstance(final char[] input) {
  136.         final StrTokenizer tok = getCSVClone();
  137.         tok.reset(input);
  138.         return tok;
  139.     }

  140.     /**
  141.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  142.      * initializing it with the given input.  The default for CSV processing
  143.      * will be trim whitespace from both ends (which can be overridden with
  144.      * the setTrimmer method).
  145.      *
  146.      * @param input  the text to parse
  147.      * @return a new tokenizer instance which parses Comma Separated Value strings
  148.      */
  149.     public static StrTokenizer getCSVInstance(final String input) {
  150.         final StrTokenizer tok = getCSVClone();
  151.         tok.reset(input);
  152.         return tok;
  153.     }
  154.     /**
  155.      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  156.      *
  157.      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
  158.      */
  159.     private static StrTokenizer getTSVClone() {
  160.         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
  161.     }

  162.     /**
  163.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  164.      * The default for CSV processing will be trim whitespace from both ends
  165.      * (which can be overridden with the setTrimmer method).
  166.      * <p>
  167.      * You must call a "reset" method to set the string which you want to parse.
  168.      * </p>
  169.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  170.      */
  171.     public static StrTokenizer getTSVInstance() {
  172.         return getTSVClone();
  173.     }

  174.     /**
  175.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  176.      * The default for CSV processing will be trim whitespace from both ends
  177.      * (which can be overridden with the setTrimmer method).
  178.      * @param input  the string to parse
  179.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  180.      */
  181.     public static StrTokenizer getTSVInstance(final char[] input) {
  182.         final StrTokenizer tok = getTSVClone();
  183.         tok.reset(input);
  184.         return tok;
  185.     }

  186.     /**
  187.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  188.      * The default for CSV processing will be trim whitespace from both ends
  189.      * (which can be overridden with the setTrimmer method).
  190.      * @param input  the string to parse
  191.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  192.      */
  193.     public static StrTokenizer getTSVInstance(final String input) {
  194.         final StrTokenizer tok = getTSVClone();
  195.         tok.reset(input);
  196.         return tok;
  197.     }

    /** The characters being tokenized; {@code null} when no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed (see {@code checkTokenized()}). */
    private String[] tokens;

    /** The current iteration position: the index in {@code tokens} of the element {@code next()} would return. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to matching nothing, i.e. no quoting. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to matching nothing, i.e. nothing is ignored. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to matching nothing, i.e. nothing is trimmed. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; {@code false} unless set. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; enabled by default. */
    private boolean ignoreEmptyTokens = true;

  216.     /**
  217.      * Constructs a tokenizer splitting on space, tab, newline and form feed
  218.      * as per StringTokenizer, but with no text to tokenize.
  219.      * <p>
  220.      * This constructor is normally used with {@link #reset(String)}.
  221.      * </p>
  222.      */
  223.     public StrTokenizer() {
  224.         this.chars = null;
  225.     }

  226.     /**
  227.      * Constructs a tokenizer splitting on space, tab, newline and form feed
  228.      * as per StringTokenizer.
  229.      *
  230.      * @param input  the string which is to be parsed, not cloned
  231.      */
  232.     public StrTokenizer(final char[] input) {
  233.         if (input == null) {
  234.             this.chars = null;
  235.         } else {
  236.             this.chars = input.clone();
  237.         }
  238.     }

  239.     /**
  240.      * Constructs a tokenizer splitting on the specified character.
  241.      *
  242.      * @param input  the string which is to be parsed, not cloned
  243.      * @param delim the field delimiter character
  244.      */
  245.     public StrTokenizer(final char[] input, final char delim) {
  246.         this(input);
  247.         setDelimiterChar(delim);
  248.     }

  249.     /**
  250.      * Constructs a tokenizer splitting on the specified delimiter character
  251.      * and handling quotes using the specified quote character.
  252.      *
  253.      * @param input  the string which is to be parsed, not cloned
  254.      * @param delim  the field delimiter character
  255.      * @param quote  the field quoted string character
  256.      */
  257.     public StrTokenizer(final char[] input, final char delim, final char quote) {
  258.         this(input, delim);
  259.         setQuoteChar(quote);
  260.     }

  261.     /**
  262.      * Constructs a tokenizer splitting on the specified string.
  263.      *
  264.      * @param input  the string which is to be parsed, not cloned
  265.      * @param delim the field delimiter string
  266.      */
  267.     public StrTokenizer(final char[] input, final String delim) {
  268.         this(input);
  269.         setDelimiterString(delim);
  270.     }

  271.     /**
  272.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  273.      *
  274.      * @param input  the string which is to be parsed, not cloned
  275.      * @param delim  the field delimiter matcher
  276.      */
  277.     public StrTokenizer(final char[] input, final StrMatcher delim) {
  278.         this(input);
  279.         setDelimiterMatcher(delim);
  280.     }

  281.     /**
  282.      * Constructs a tokenizer splitting using the specified delimiter matcher
  283.      * and handling quotes using the specified quote matcher.
  284.      *
  285.      * @param input  the string which is to be parsed, not cloned
  286.      * @param delim  the field delimiter character
  287.      * @param quote  the field quoted string character
  288.      */
  289.     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
  290.         this(input, delim);
  291.         setQuoteMatcher(quote);
  292.     }

  293.     /**
  294.      * Constructs a tokenizer splitting on space, tab, newline and form feed
  295.      * as per StringTokenizer.
  296.      *
  297.      * @param input  the string which is to be parsed
  298.      */
  299.     public StrTokenizer(final String input) {
  300.         if (input != null) {
  301.             chars = input.toCharArray();
  302.         } else {
  303.             chars = null;
  304.         }
  305.     }

  306.     /**
  307.      * Constructs a tokenizer splitting on the specified delimiter character.
  308.      *
  309.      * @param input  the string which is to be parsed
  310.      * @param delim  the field delimiter character
  311.      */
  312.     public StrTokenizer(final String input, final char delim) {
  313.         this(input);
  314.         setDelimiterChar(delim);
  315.     }

  316.     /**
  317.      * Constructs a tokenizer splitting on the specified delimiter character
  318.      * and handling quotes using the specified quote character.
  319.      *
  320.      * @param input  the string which is to be parsed
  321.      * @param delim  the field delimiter character
  322.      * @param quote  the field quoted string character
  323.      */
  324.     public StrTokenizer(final String input, final char delim, final char quote) {
  325.         this(input, delim);
  326.         setQuoteChar(quote);
  327.     }

  328.     /**
  329.      * Constructs a tokenizer splitting on the specified delimiter string.
  330.      *
  331.      * @param input  the string which is to be parsed
  332.      * @param delim  the field delimiter string
  333.      */
  334.     public StrTokenizer(final String input, final String delim) {
  335.         this(input);
  336.         setDelimiterString(delim);
  337.     }

  338.     /**
  339.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  340.      *
  341.      * @param input  the string which is to be parsed
  342.      * @param delim  the field delimiter matcher
  343.      */
  344.     public StrTokenizer(final String input, final StrMatcher delim) {
  345.         this(input);
  346.         setDelimiterMatcher(delim);
  347.     }

  348.     /**
  349.      * Constructs a tokenizer splitting using the specified delimiter matcher
  350.      * and handling quotes using the specified quote matcher.
  351.      *
  352.      * @param input  the string which is to be parsed
  353.      * @param delim  the field delimiter matcher
  354.      * @param quote  the field quoted string matcher
  355.      */
  356.     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
  357.         this(input, delim);
  358.         setQuoteMatcher(quote);
  359.     }

  360.     /**
  361.      * Unsupported ListIterator operation.
  362.      * @param obj this parameter ignored.
  363.      * @throws UnsupportedOperationException always
  364.      */
  365.     @Override
  366.     public void add(final String obj) {
  367.         throw new UnsupportedOperationException("add() is unsupported");
  368.     }

  369.     /**
  370.      * Adds a token to a list, paying attention to the parameters we've set.
  371.      *
  372.      * @param list  the list to add to
  373.      * @param tok  the token to add
  374.      */
  375.     private void addToken(final List<String> list, String tok) {
  376.         if (tok == null || tok.isEmpty()) {
  377.             if (isIgnoreEmptyTokens()) {
  378.                 return;
  379.             }
  380.             if (isEmptyTokenAsNull()) {
  381.                 tok = null;
  382.             }
  383.         }
  384.         list.add(tok);
  385.     }

  386.     /**
  387.      * Checks if tokenization has been done, and if not then do it.
  388.      */
  389.     private void checkTokenized() {
  390.         if (tokens == null) {
  391.             if (chars == null) {
  392.                 // still call tokenize as subclass may do some work
  393.                 final List<String> split = tokenize(null, 0, 0);
  394.                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
  395.             } else {
  396.                 final List<String> split = tokenize(chars, 0, chars.length);
  397.                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
  398.             }
  399.         }
  400.     }

  401.     /**
  402.      * Creates a new instance of this Tokenizer. The new instance is reset so
  403.      * that it will be at the start of the token list.
  404.      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
  405.      *
  406.      * @return a new instance of this Tokenizer which has been reset.
  407.      */
  408.     @Override
  409.     public Object clone() {
  410.         try {
  411.             return cloneReset();
  412.         } catch (final CloneNotSupportedException ex) {
  413.             return null;
  414.         }
  415.     }

  416.     /**
  417.      * Creates a new instance of this Tokenizer. The new instance is reset so that
  418.      * it will be at the start of the token list.
  419.      *
  420.      * @return a new instance of this Tokenizer which has been reset.
  421.      * @throws CloneNotSupportedException if there is a problem cloning
  422.      */
  423.     Object cloneReset() throws CloneNotSupportedException {
  424.         // this method exists to enable 100% test coverage
  425.         final StrTokenizer cloned = (StrTokenizer) super.clone();
  426.         if (cloned.chars != null) {
  427.             cloned.chars = cloned.chars.clone();
  428.         }
  429.         cloned.reset();
  430.         return cloned;
  431.     }

  432.     /**
  433.      * Gets the String content that the tokenizer is parsing.
  434.      *
  435.      * @return The string content being parsed
  436.      */
  437.     public String getContent() {
  438.         if (chars == null) {
  439.             return null;
  440.         }
  441.         return new String(chars);
  442.     }

  443.     /**
  444.      * Gets the field delimiter matcher.
  445.      *
  446.      * @return The delimiter matcher in use
  447.      */
  448.     public StrMatcher getDelimiterMatcher() {
  449.         return this.delimMatcher;
  450.     }

  451.     /**
  452.      * Gets the ignored character matcher.
  453.      * <p>
  454.      * These characters are ignored when parsing the String, unless they are
  455.      * within a quoted region.
  456.      * The default value is not to ignore anything.
  457.      * </p>
  458.      *
  459.      * @return The ignored matcher in use
  460.      */
  461.     public StrMatcher getIgnoredMatcher() {
  462.         return ignoredMatcher;
  463.     }

  464.     /**
  465.      * Gets the quote matcher currently in use.
  466.      * <p>
  467.      * The quote character is used to wrap data between the tokens.
  468.      * This enables delimiters to be entered as data.
  469.      * The default value is '"' (double quote).
  470.      * </p>
  471.      *
  472.      * @return The quote matcher in use
  473.      */
  474.     public StrMatcher getQuoteMatcher() {
  475.         return quoteMatcher;
  476.     }

  477.     /**
  478.      * Gets a copy of the full token list as an independent modifiable array.
  479.      *
  480.      * @return The tokens as a String array
  481.      */
  482.     public String[] getTokenArray() {
  483.         checkTokenized();
  484.         return tokens.clone();
  485.     }

  486.     /**
  487.      * Gets a copy of the full token list as an independent modifiable list.
  488.      *
  489.      * @return The tokens as a String array
  490.      */
  491.     public List<String> getTokenList() {
  492.         checkTokenized();
  493.         final List<String> list = new ArrayList<>(tokens.length);
  494.         Collections.addAll(list, tokens);

  495.         return list;
  496.     }

  497.     /**
  498.      * Gets the trimmer character matcher.
  499.      * <p>
  500.      * These characters are trimmed off on each side of the delimiter
  501.      * until the token or quote is found.
  502.      * The default value is not to trim anything.
  503.      * </p>
  504.      *
  505.      * @return The trimmer matcher in use
  506.      */
  507.     public StrMatcher getTrimmerMatcher() {
  508.         return trimmerMatcher;
  509.     }

  510.     /**
  511.      * Checks whether there are any more tokens.
  512.      *
  513.      * @return true if there are more tokens
  514.      */
  515.     @Override
  516.     public boolean hasNext() {
  517.         checkTokenized();
  518.         return tokenPos < tokens.length;
  519.     }

  520.     /**
  521.      * Checks whether there are any previous tokens that can be iterated to.
  522.      *
  523.      * @return true if there are previous tokens
  524.      */
  525.     @Override
  526.     public boolean hasPrevious() {
  527.         checkTokenized();
  528.         return tokenPos > 0;
  529.     }

  530.     /**
  531.      * Gets whether the tokenizer currently returns empty tokens as null.
  532.      * The default for this property is false.
  533.      *
  534.      * @return true if empty tokens are returned as null
  535.      */
  536.     public boolean isEmptyTokenAsNull() {
  537.         return this.emptyAsNull;
  538.     }

  539.     /**
  540.      * Gets whether the tokenizer currently ignores empty tokens.
  541.      * The default for this property is true.
  542.      *
  543.      * @return true if empty tokens are not returned
  544.      */
  545.     public boolean isIgnoreEmptyTokens() {
  546.         return ignoreEmptyTokens;
  547.     }

  548.     /**
  549.      * Checks if the characters at the index specified match the quote
  550.      * already matched in readNextToken().
  551.      *
  552.      * @param srcChars  the character array being tokenized
  553.      * @param pos  the position to check for a quote
  554.      * @param len  the length of the character array being tokenized
  555.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  556.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  557.      * @return true if a quote is matched
  558.      */
  559.     private boolean isQuote(final char[] srcChars,
  560.                             final int pos,
  561.                             final int len,
  562.                             final int quoteStart,
  563.                             final int quoteLen) {
  564.         for (int i = 0; i < quoteLen; i++) {
  565.             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
  566.                 return false;
  567.             }
  568.         }
  569.         return true;
  570.     }

  571.     /**
  572.      * Gets the next token.
  573.      *
  574.      * @return The next String token
  575.      * @throws NoSuchElementException if there are no more elements
  576.      */
  577.     @Override
  578.     public String next() {
  579.         if (hasNext()) {
  580.             return tokens[tokenPos++];
  581.         }
  582.         throw new NoSuchElementException();
  583.     }

  584.     /**
  585.      * Gets the index of the next token to return.
  586.      *
  587.      * @return The next token index
  588.      */
  589.     @Override
  590.     public int nextIndex() {
  591.         return tokenPos;
  592.     }

  593.     /**
  594.      * Gets the next token from the String.
  595.      * Equivalent to {@link #next()} except it returns null rather than
  596.      * throwing {@link NoSuchElementException} when no tokens remain.
  597.      *
  598.      * @return The next sequential token, or null when no more tokens are found
  599.      */
  600.     public String nextToken() {
  601.         if (hasNext()) {
  602.             return tokens[tokenPos++];
  603.         }
  604.         return null;
  605.     }

  606.     /**
  607.      * Gets the token previous to the last returned token.
  608.      *
  609.      * @return The previous token
  610.      */
  611.     @Override
  612.     public String previous() {
  613.         if (hasPrevious()) {
  614.             return tokens[--tokenPos];
  615.         }
  616.         throw new NoSuchElementException();
  617.     }

  618.     /**
  619.      * Gets the index of the previous token.
  620.      *
  621.      * @return The previous token index
  622.      */
  623.     @Override
  624.     public int previousIndex() {
  625.         return tokenPos - 1;
  626.     }

  627.     /**
  628.      * Gets the previous token from the String.
  629.      *
  630.      * @return The previous sequential token, or null when no more tokens are found
  631.      */
  632.     public String previousToken() {
  633.         if (hasPrevious()) {
  634.             return tokens[--tokenPos];
  635.         }
  636.         return null;
  637.     }

  638.     /**
  639.      * Reads character by character through the String to get the next token.
  640.      *
  641.      * @param srcChars  the character array being tokenized
  642.      * @param start  the first character of field
  643.      * @param len  the length of the character array being tokenized
  644.      * @param workArea  a temporary work area
  645.      * @param tokenList  the list of parsed tokens
  646.      * @return The starting position of the next field (the character
  647.      *  immediately after the delimiter), or -1 if end of string found
  648.      */
  649.     private int readNextToken(final char[] srcChars,
  650.                               int start,
  651.                               final int len,
  652.                               final StrBuilder workArea,
  653.                               final List<String> tokenList) {
  654.         // skip all leading whitespace, unless it is the
  655.         // field delimiter or the quote character
  656.         while (start < len) {
  657.             final int removeLen = Math.max(
  658.                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
  659.                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
  660.             if (removeLen == 0
  661.                     || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
  662.                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
  663.                 break;
  664.             }
  665.             start += removeLen;
  666.         }

  667.         // handle reaching end
  668.         if (start >= len) {
  669.             addToken(tokenList, StringUtils.EMPTY);
  670.             return -1;
  671.         }

  672.         // handle empty token
  673.         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
  674.         if (delimLen > 0) {
  675.             addToken(tokenList, StringUtils.EMPTY);
  676.             return start + delimLen;
  677.         }

  678.         // handle found token
  679.         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
  680.         if (quoteLen > 0) {
  681.             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
  682.         }
  683.         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
  684.     }

  685.     /**
  686.      * Reads a possibly quoted string token.
  687.      *
  688.      * @param srcChars  the character array being tokenized
  689.      * @param start  the first character of field
  690.      * @param len  the length of the character array being tokenized
  691.      * @param workArea  a temporary work area
  692.      * @param tokenList  the list of parsed tokens
  693.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  694.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  695.      * @return The starting position of the next field (the character
  696.      *  immediately after the delimiter, or if end of string found,
  697.      *  then the length of string
  698.      */
  699.     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
  700.                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
  701.         // Loop until we've found the end of the quoted
  702.         // string or the end of the input
  703.         workArea.clear();
  704.         int pos = start;
  705.         boolean quoting = quoteLen > 0;
  706.         int trimStart = 0;

  707.         while (pos < len) {
  708.             // quoting mode can occur several times throughout a string
  709.             // we must switch between quoting and non-quoting until we
  710.             // encounter a non-quoted delimiter, or end of string
  711.             if (quoting) {
  712.                 // In quoting mode

  713.                 // If we've found a quote character, see if it's
  714.                 // followed by a second quote.  If so, then we need
  715.                 // to actually put the quote character into the token
  716.                 // rather than end the token.
  717.                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  718.                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
  719.                         // matched pair of quotes, thus an escaped quote
  720.                         workArea.append(srcChars, pos, quoteLen);
  721.                         pos += quoteLen * 2;
  722.                         trimStart = workArea.size();
  723.                         continue;
  724.                     }

  725.                     // end of quoting
  726.                     quoting = false;
  727.                     pos += quoteLen;
  728.                     continue;
  729.                 }

  730.             } else {
  731.                 // Not in quoting mode

  732.                 // check for delimiter, and thus end of token
  733.                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
  734.                 if (delimLen > 0) {
  735.                     // return condition when end of token found
  736.                     addToken(tokenList, workArea.substring(0, trimStart));
  737.                     return pos + delimLen;
  738.                 }

  739.                 // check for quote, and thus back into quoting mode
  740.                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  741.                     quoting = true;
  742.                     pos += quoteLen;
  743.                     continue;
  744.                 }

  745.                 // check for ignored (outside quotes), and ignore
  746.                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
  747.                 if (ignoredLen > 0) {
  748.                     pos += ignoredLen;
  749.                     continue;
  750.                 }

  751.                 // check for trimmed character
  752.                 // don't yet know if its at the end, so copy to workArea
  753.                 // use trimStart to keep track of trim at the end
  754.                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
  755.                 if (trimmedLen > 0) {
  756.                     workArea.append(srcChars, pos, trimmedLen);
  757.                     pos += trimmedLen;
  758.                     continue;
  759.                 }

  760.             }
  761.             // copy regular character from inside quotes
  762.             workArea.append(srcChars[pos++]);
  763.             trimStart = workArea.size();
  764.         }

  765.         // return condition when end of string found
  766.         addToken(tokenList, workArea.substring(0, trimStart));
  767.         return -1;
  768.     }

  769.     /**
  770.      * Unsupported ListIterator operation.
  771.      *
  772.      * @throws UnsupportedOperationException always
  773.      */
  774.     @Override
  775.     public void remove() {
  776.         throw new UnsupportedOperationException("remove() is unsupported");
  777.     }

  778.     /**
  779.      * Resets this tokenizer, forgetting all parsing and iteration already completed.
  780.      * <p>
  781.      * This method allows the same tokenizer to be reused for the same String.
  782.      *
  783.      * @return this, to enable chaining
  784.      */
  785.     public StrTokenizer reset() {
  786.         tokenPos = 0;
  787.         tokens = null;
  788.         return this;
  789.     }

  790.     /**
  791.      * Reset this tokenizer, giving it a new input string to parse.
  792.      * In this manner you can re-use a tokenizer with the same settings
  793.      * on multiple input lines.
  794.      *
  795.      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
  796.      * @return this, to enable chaining
  797.      */
  798.     public StrTokenizer reset(final char[] input) {
  799.         reset();
  800.         if (input != null) {
  801.             this.chars = input.clone();
  802.         } else {
  803.             this.chars = null;
  804.         }
  805.         return this;
  806.     }

  807.     /**
  808.      * Reset this tokenizer, giving it a new input string to parse.
  809.      * In this manner you can re-use a tokenizer with the same settings
  810.      * on multiple input lines.
  811.      *
  812.      * @param input  the new string to tokenize, null sets no text to parse
  813.      * @return this, to enable chaining
  814.      */
  815.     public StrTokenizer reset(final String input) {
  816.         reset();
  817.         if (input != null) {
  818.             this.chars = input.toCharArray();
  819.         } else {
  820.             this.chars = null;
  821.         }
  822.         return this;
  823.     }

  824.     /**
  825.      * Unsupported ListIterator operation.
  826.      * @param obj this parameter ignored.
  827.      * @throws UnsupportedOperationException always
  828.      */
  829.     @Override
  830.     public void set(final String obj) {
  831.         throw new UnsupportedOperationException("set() is unsupported");
  832.     }

  833.     /**
  834.      * Sets the field delimiter character.
  835.      *
  836.      * @param delim  the delimiter character to use
  837.      * @return this, to enable chaining
  838.      */
  839.     public StrTokenizer setDelimiterChar(final char delim) {
  840.         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
  841.     }

  842.     /**
  843.      * Sets the field delimiter matcher.
  844.      * <p>
  845.      * The delimiter is used to separate one token from another.
  846.      * </p>
  847.      *
  848.      * @param delim  the delimiter matcher to use
  849.      * @return this, to enable chaining
  850.      */
  851.     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
  852.         if (delim == null) {
  853.             this.delimMatcher = StrMatcher.noneMatcher();
  854.         } else {
  855.             this.delimMatcher = delim;
  856.         }
  857.         return this;
  858.     }

  859.     /**
  860.      * Sets the field delimiter string.
  861.      *
  862.      * @param delim  the delimiter string to use
  863.      * @return this, to enable chaining
  864.      */
  865.     public StrTokenizer setDelimiterString(final String delim) {
  866.         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
  867.     }

  868.     /**
  869.      * Sets whether the tokenizer should return empty tokens as null.
  870.      * The default for this property is false.
  871.      *
  872.      * @param emptyAsNull  whether empty tokens are returned as null
  873.      * @return this, to enable chaining
  874.      */
  875.     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
  876.         this.emptyAsNull = emptyAsNull;
  877.         return this;
  878.     }

  879.     /**
  880.      * Sets the character to ignore.
  881.      * <p>
  882.      * This character is ignored when parsing the String, unless it is
  883.      * within a quoted region.
  884.      * </p>
  885.      *
  886.      * @param ignored  the ignored character to use
  887.      * @return this, to enable chaining
  888.      */
  889.     public StrTokenizer setIgnoredChar(final char ignored) {
  890.         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
  891.     }

  892.     /**
  893.      * Sets the matcher for characters to ignore.
  894.      * <p>
  895.      * These characters are ignored when parsing the String, unless they are
  896.      * within a quoted region.
  897.      * </p>
  898.      *
  899.      * @param ignored  the ignored matcher to use, null ignored
  900.      * @return this, to enable chaining
  901.      */
  902.     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
  903.         if (ignored != null) {
  904.             this.ignoredMatcher = ignored;
  905.         }
  906.         return this;
  907.     }

  908.     /**
  909.      * Sets whether the tokenizer should ignore and not return empty tokens.
  910.      * The default for this property is true.
  911.      *
  912.      * @param ignoreEmptyTokens  whether empty tokens are not returned
  913.      * @return this, to enable chaining
  914.      */
  915.     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
  916.         this.ignoreEmptyTokens = ignoreEmptyTokens;
  917.         return this;
  918.     }

  919.     /**
  920.      * Sets the quote character to use.
  921.      * <p>
  922.      * The quote character is used to wrap data between the tokens.
  923.      * This enables delimiters to be entered as data.
  924.      * </p>
  925.      *
  926.      * @param quote  the quote character to use
  927.      * @return this, to enable chaining
  928.      */
  929.     public StrTokenizer setQuoteChar(final char quote) {
  930.         return setQuoteMatcher(StrMatcher.charMatcher(quote));
  931.     }

  932.     /**
  933.      * Sets the quote matcher to use.
  934.      * <p>
  935.      * The quote character is used to wrap data between the tokens.
  936.      * This enables delimiters to be entered as data.
  937.      * </p>
  938.      *
  939.      * @param quote  the quote matcher to use, null ignored
  940.      * @return this, to enable chaining
  941.      */
  942.     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
  943.         if (quote != null) {
  944.             this.quoteMatcher = quote;
  945.         }
  946.         return this;
  947.     }

  948.     /**
  949.      * Sets the matcher for characters to trim.
  950.      * <p>
  951.      * These characters are trimmed off on each side of the delimiter
  952.      * until the token or quote is found.
  953.      * </p>
  954.      *
  955.      * @param trimmer  the trimmer matcher to use, null ignored
  956.      * @return this, to enable chaining
  957.      */
  958.     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
  959.         if (trimmer != null) {
  960.             this.trimmerMatcher = trimmer;
  961.         }
  962.         return this;
  963.     }

  964.     /**
  965.      * Gets the number of tokens found in the String.
  966.      *
  967.      * @return The number of matched tokens
  968.      */
  969.     public int size() {
  970.         checkTokenized();
  971.         return tokens.length;
  972.     }

  973.     /**
  974.      * Internal method to performs the tokenization.
  975.      * <p>
  976.      * Most users of this class do not need to call this method. This method
  977.      * will be called automatically by other (public) methods when required.
  978.      * </p>
  979.      * <p>
  980.      * This method exists to allow subclasses to add code before or after the
  981.      * tokenization. For example, a subclass could alter the character array,
  982.      * offset or count to be parsed, or call the tokenizer multiple times on
  983.      * multiple strings. It is also be possible to filter the results.
  984.      * </p>
  985.      * <p>
  986.      * {@code StrTokenizer} will always pass a zero offset and a count
  987.      * equal to the length of the array to this method, however a subclass
  988.      * may pass other values, or even an entirely different array.
  989.      * </p>
  990.      *
  991.      * @param srcChars  the character array being tokenized, may be null
  992.      * @param offset  the start position within the character array, must be valid
  993.      * @param count  the number of characters to tokenize, must be valid
  994.      * @return The modifiable list of String tokens, unmodifiable if null array or zero count
  995.      */
  996.     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
  997.         if (srcChars == null || count == 0) {
  998.             return Collections.emptyList();
  999.         }
  1000.         final StrBuilder buf = new StrBuilder();
  1001.         final List<String> tokenList = new ArrayList<>();
  1002.         int pos = offset;

  1003.         // loop around the entire buffer
  1004.         while (pos >= 0 && pos < count) {
  1005.             // find next token
  1006.             pos = readNextToken(srcChars, pos, count, buf, tokenList);

  1007.             // handle case where end of string is a delimiter
  1008.             if (pos >= count) {
  1009.                 addToken(tokenList, StringUtils.EMPTY);
  1010.             }
  1011.         }
  1012.         return tokenList;
  1013.     }

  1014.     /**
  1015.      * Gets the String content that the tokenizer is parsing.
  1016.      *
  1017.      * @return The string content being parsed
  1018.      */
  1019.     @Override
  1020.     public String toString() {
  1021.         if (tokens == null) {
  1022.             return "StrTokenizer[not tokenized yet]";
  1023.         }
  1024.         return "StrTokenizer" + getTokenList();
  1025.     }

  1026. }