StrTokenizer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text;

  18. import java.util.ArrayList;
  19. import java.util.Collections;
  20. import java.util.List;
  21. import java.util.ListIterator;
  22. import java.util.NoSuchElementException;

  23. /**
  24.  * Tokenizes a string based on delimiters (separators)
  25.  * and supporting quoting and ignored character concepts.
  26.  * <p>
  27.  * This class can split a String into many smaller strings. It aims
  28.  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
  29.  * however it offers much more control and flexibility including implementing
  30.  * the <code>ListIterator</code> interface. By default, it is set up
  31.  * like <code>StringTokenizer</code>.
  32.  * <p>
  33.  * The input String is split into a number of <i>tokens</i>.
  34.  * Each token is separated from the next String by a <i>delimiter</i>.
  35.  * One or more delimiter characters must be specified.
  36.  * <p>
  37.  * Each token may be surrounded by quotes.
  38.  * The <i>quote</i> matcher specifies the quote character(s).
  39.  * A quote may be escaped within a quoted section by duplicating itself.
  40.  * <p>
  41.  * Between each token and the delimiter are potentially characters that need trimming.
  42.  * The <i>trimmer</i> matcher specifies these characters.
  43.  * One usage might be to trim whitespace characters.
  44.  * <p>
  45.  * At any point outside the quotes there might potentially be invalid characters.
  46.  * The <i>ignored</i> matcher specifies these characters to be removed.
  47.  * One usage might be to remove new line characters.
  48.  * <p>
  49.  * Empty tokens may be removed or returned as null.
  50.  * <pre>
  51.  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
  52.  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
  53.  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
  54.  * </pre>
  55.  * <p>
  56.  *
  57.  * This tokenizer has the following properties and options:
  58.  *
  59.  * <table summary="Tokenizer Properties">
  60.  *  <tr>
  61.  *   <th>Property</th><th>Type</th><th>Default</th>
  62.  *  </tr>
  63.  *  <tr>
  64.  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
  65.  *  </tr>
  66.  *  <tr>
  67.  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
  68.  *  </tr>
  69.  *  <tr>
  70.  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
  71.  *  </tr>
  72.  *  <tr>
  73.  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
  74.  *  </tr>
  75.  *  <tr>
  76.  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
  77.  *  </tr>
  78.  * </table>
  79.  *
  80.  * @since 1.0
  81.  */
  82. public class StrTokenizer implements ListIterator<String>, Cloneable {

  83.     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
  84.     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
  85.     static {
  86.         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  87.         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
  88.         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  89.         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  90.         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  91.         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  92.         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

  93.         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
  94.         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
  95.         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
  96.         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
  97.         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
  98.         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  99.         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
  100.     }

  101.     /** The text to work on. */
  102.     private char chars[];
  103.     /** The parsed tokens */
  104.     private String tokens[];
  105.     /** The current iteration position */
  106.     private int tokenPos;

  107.     /** The delimiter matcher */
  108.     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
  109.     /** The quote matcher */
  110.     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
  111.     /** The ignored matcher */
  112.     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
  113.     /** The trimmer matcher */
  114.     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

  115.     /** Whether to return empty tokens as null */
  116.     private boolean emptyAsNull = false;
  117.     /** Whether to ignore empty tokens */
  118.     private boolean ignoreEmptyTokens = true;

  119.     //-----------------------------------------------------------------------

  120.     /**
  121.      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
  122.      *
  123.      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
  124.      */
  125.     private static StrTokenizer getCSVClone() {
  126.         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
  127.     }

  128.     /**
  129.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  130.      * initializing it with the given input.  The default for CSV processing
  131.      * will be trim whitespace from both ends (which can be overridden with
  132.      * the setTrimmer method).
  133.      * <p>
  134.      * You must call a "reset" method to set the string which you want to parse.
  135.      * @return a new tokenizer instance which parses Comma Separated Value strings
  136.      */
  137.     public static StrTokenizer getCSVInstance() {
  138.         return getCSVClone();
  139.     }

  140.     /**
  141.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  142.      * initializing it with the given input.  The default for CSV processing
  143.      * will be trim whitespace from both ends (which can be overridden with
  144.      * the setTrimmer method).
  145.      *
  146.      * @param input  the text to parse
  147.      * @return a new tokenizer instance which parses Comma Separated Value strings
  148.      */
  149.     public static StrTokenizer getCSVInstance(final String input) {
  150.         final StrTokenizer tok = getCSVClone();
  151.         tok.reset(input);
  152.         return tok;
  153.     }

  154.     /**
  155.      * Gets a new tokenizer instance which parses Comma Separated Value strings
  156.      * initializing it with the given input.  The default for CSV processing
  157.      * will be trim whitespace from both ends (which can be overridden with
  158.      * the setTrimmer method).
  159.      *
  160.      * @param input  the text to parse
  161.      * @return a new tokenizer instance which parses Comma Separated Value strings
  162.      */
  163.     public static StrTokenizer getCSVInstance(final char[] input) {
  164.         final StrTokenizer tok = getCSVClone();
  165.         tok.reset(input);
  166.         return tok;
  167.     }

  168.     /**
  169.      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
  170.      *
  171.      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
  172.      */
  173.     private static StrTokenizer getTSVClone() {
  174.         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
  175.     }


  176.     /**
  177.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  178.      * The default for CSV processing will be trim whitespace from both ends
  179.      * (which can be overridden with the setTrimmer method).
  180.      * <p>
  181.      * You must call a "reset" method to set the string which you want to parse.
  182.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  183.      */
  184.     public static StrTokenizer getTSVInstance() {
  185.         return getTSVClone();
  186.     }

  187.     /**
  188.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  189.      * The default for CSV processing will be trim whitespace from both ends
  190.      * (which can be overridden with the setTrimmer method).
  191.      * @param input  the string to parse
  192.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  193.      */
  194.     public static StrTokenizer getTSVInstance(final String input) {
  195.         final StrTokenizer tok = getTSVClone();
  196.         tok.reset(input);
  197.         return tok;
  198.     }

  199.     /**
  200.      * Gets a new tokenizer instance which parses Tab Separated Value strings.
  201.      * The default for CSV processing will be trim whitespace from both ends
  202.      * (which can be overridden with the setTrimmer method).
  203.      * @param input  the string to parse
  204.      * @return a new tokenizer instance which parses Tab Separated Value strings.
  205.      */
  206.     public static StrTokenizer getTSVInstance(final char[] input) {
  207.         final StrTokenizer tok = getTSVClone();
  208.         tok.reset(input);
  209.         return tok;
  210.     }

  211.     //-----------------------------------------------------------------------
  212.     /**
  213.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  214.      * as per StringTokenizer, but with no text to tokenize.
  215.      * <p>
  216.      * This constructor is normally used with {@link #reset(String)}.
  217.      */
  218.     public StrTokenizer() {
  219.         super();
  220.         this.chars = null;
  221.     }

  222.     /**
  223.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  224.      * as per StringTokenizer.
  225.      *
  226.      * @param input  the string which is to be parsed
  227.      */
  228.     public StrTokenizer(final String input) {
  229.         super();
  230.         if (input != null) {
  231.             chars = input.toCharArray();
  232.         } else {
  233.             chars = null;
  234.         }
  235.     }

  236.     /**
  237.      * Constructs a tokenizer splitting on the specified delimiter character.
  238.      *
  239.      * @param input  the string which is to be parsed
  240.      * @param delim  the field delimiter character
  241.      */
  242.     public StrTokenizer(final String input, final char delim) {
  243.         this(input);
  244.         setDelimiterChar(delim);
  245.     }

  246.     /**
  247.      * Constructs a tokenizer splitting on the specified delimiter string.
  248.      *
  249.      * @param input  the string which is to be parsed
  250.      * @param delim  the field delimiter string
  251.      */
  252.     public StrTokenizer(final String input, final String delim) {
  253.         this(input);
  254.         setDelimiterString(delim);
  255.     }

  256.     /**
  257.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  258.      *
  259.      * @param input  the string which is to be parsed
  260.      * @param delim  the field delimiter matcher
  261.      */
  262.     public StrTokenizer(final String input, final StrMatcher delim) {
  263.         this(input);
  264.         setDelimiterMatcher(delim);
  265.     }

  266.     /**
  267.      * Constructs a tokenizer splitting on the specified delimiter character
  268.      * and handling quotes using the specified quote character.
  269.      *
  270.      * @param input  the string which is to be parsed
  271.      * @param delim  the field delimiter character
  272.      * @param quote  the field quoted string character
  273.      */
  274.     public StrTokenizer(final String input, final char delim, final char quote) {
  275.         this(input, delim);
  276.         setQuoteChar(quote);
  277.     }

  278.     /**
  279.      * Constructs a tokenizer splitting using the specified delimiter matcher
  280.      * and handling quotes using the specified quote matcher.
  281.      *
  282.      * @param input  the string which is to be parsed
  283.      * @param delim  the field delimiter matcher
  284.      * @param quote  the field quoted string matcher
  285.      */
  286.     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
  287.         this(input, delim);
  288.         setQuoteMatcher(quote);
  289.     }

  290.     /**
  291.      * Constructs a tokenizer splitting on space, tab, newline and formfeed
  292.      * as per StringTokenizer.
  293.      *
  294.      * @param input  the string which is to be parsed, not cloned
  295.      */
  296.     public StrTokenizer(final char[] input) {
  297.         super();
  298.         if (input == null) {
  299.             this.chars = null;
  300.         } else {
  301.             this.chars = input.clone();
  302.         }
  303.     }

  304.     /**
  305.      * Constructs a tokenizer splitting on the specified character.
  306.      *
  307.      * @param input  the string which is to be parsed, not cloned
  308.      * @param delim the field delimiter character
  309.      */
  310.     public StrTokenizer(final char[] input, final char delim) {
  311.         this(input);
  312.         setDelimiterChar(delim);
  313.     }

  314.     /**
  315.      * Constructs a tokenizer splitting on the specified string.
  316.      *
  317.      * @param input  the string which is to be parsed, not cloned
  318.      * @param delim the field delimiter string
  319.      */
  320.     public StrTokenizer(final char[] input, final String delim) {
  321.         this(input);
  322.         setDelimiterString(delim);
  323.     }

  324.     /**
  325.      * Constructs a tokenizer splitting using the specified delimiter matcher.
  326.      *
  327.      * @param input  the string which is to be parsed, not cloned
  328.      * @param delim  the field delimiter matcher
  329.      */
  330.     public StrTokenizer(final char[] input, final StrMatcher delim) {
  331.         this(input);
  332.         setDelimiterMatcher(delim);
  333.     }

  334.     /**
  335.      * Constructs a tokenizer splitting on the specified delimiter character
  336.      * and handling quotes using the specified quote character.
  337.      *
  338.      * @param input  the string which is to be parsed, not cloned
  339.      * @param delim  the field delimiter character
  340.      * @param quote  the field quoted string character
  341.      */
  342.     public StrTokenizer(final char[] input, final char delim, final char quote) {
  343.         this(input, delim);
  344.         setQuoteChar(quote);
  345.     }

  346.     /**
  347.      * Constructs a tokenizer splitting using the specified delimiter matcher
  348.      * and handling quotes using the specified quote matcher.
  349.      *
  350.      * @param input  the string which is to be parsed, not cloned
  351.      * @param delim  the field delimiter character
  352.      * @param quote  the field quoted string character
  353.      */
  354.     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
  355.         this(input, delim);
  356.         setQuoteMatcher(quote);
  357.     }

  358.     // API
  359.     //-----------------------------------------------------------------------
  360.     /**
  361.      * Gets the number of tokens found in the String.
  362.      *
  363.      * @return the number of matched tokens
  364.      */
  365.     public int size() {
  366.         checkTokenized();
  367.         return tokens.length;
  368.     }

  369.     /**
  370.      * Gets the next token from the String.
  371.      * Equivalent to {@link #next()} except it returns null rather than
  372.      * throwing {@link NoSuchElementException} when no tokens remain.
  373.      *
  374.      * @return the next sequential token, or null when no more tokens are found
  375.      */
  376.     public String nextToken() {
  377.         if (hasNext()) {
  378.             return tokens[tokenPos++];
  379.         }
  380.         return null;
  381.     }

  382.     /**
  383.      * Gets the previous token from the String.
  384.      *
  385.      * @return the previous sequential token, or null when no more tokens are found
  386.      */
  387.     public String previousToken() {
  388.         if (hasPrevious()) {
  389.             return tokens[--tokenPos];
  390.         }
  391.         return null;
  392.     }

  393.     /**
  394.      * Gets a copy of the full token list as an independent modifiable array.
  395.      *
  396.      * @return the tokens as a String array
  397.      */
  398.     public String[] getTokenArray() {
  399.         checkTokenized();
  400.         return tokens.clone();
  401.     }

  402.     /**
  403.      * Gets a copy of the full token list as an independent modifiable list.
  404.      *
  405.      * @return the tokens as a String array
  406.      */
  407.     public List<String> getTokenList() {
  408.         checkTokenized();
  409.         final List<String> list = new ArrayList<>(tokens.length);
  410.         for (final String element : tokens) {
  411.             list.add(element);
  412.         }
  413.         return list;
  414.     }

  415.     /**
  416.      * Resets this tokenizer, forgetting all parsing and iteration already completed.
  417.      * <p>
  418.      * This method allows the same tokenizer to be reused for the same String.
  419.      *
  420.      * @return this, to enable chaining
  421.      */
  422.     public org.apache.commons.text.StrTokenizer reset() {
  423.         tokenPos = 0;
  424.         tokens = null;
  425.         return this;
  426.     }

  427.     /**
  428.      * Reset this tokenizer, giving it a new input string to parse.
  429.      * In this manner you can re-use a tokenizer with the same settings
  430.      * on multiple input lines.
  431.      *
  432.      * @param input  the new string to tokenize, null sets no text to parse
  433.      * @return this, to enable chaining
  434.      */
  435.     public org.apache.commons.text.StrTokenizer reset(final String input) {
  436.         reset();
  437.         if (input != null) {
  438.             this.chars = input.toCharArray();
  439.         } else {
  440.             this.chars = null;
  441.         }
  442.         return this;
  443.     }

  444.     /**
  445.      * Reset this tokenizer, giving it a new input string to parse.
  446.      * In this manner you can re-use a tokenizer with the same settings
  447.      * on multiple input lines.
  448.      *
  449.      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
  450.      * @return this, to enable chaining
  451.      */
  452.     public org.apache.commons.text.StrTokenizer reset(final char[] input) {
  453.         reset();
  454.         if (input != null) {
  455.             this.chars = input;
  456.         } else {
  457.             this.chars = null;
  458.         }
  459.         return this;
  460.     }

  461.     // ListIterator
  462.     //-----------------------------------------------------------------------
  463.     /**
  464.      * Checks whether there are any more tokens.
  465.      *
  466.      * @return true if there are more tokens
  467.      */
  468.     @Override
  469.     public boolean hasNext() {
  470.         checkTokenized();
  471.         return tokenPos < tokens.length;
  472.     }

  473.     /**
  474.      * Gets the next token.
  475.      *
  476.      * @return the next String token
  477.      * @throws NoSuchElementException if there are no more elements
  478.      */
  479.     @Override
  480.     public String next() {
  481.         if (hasNext()) {
  482.             return tokens[tokenPos++];
  483.         }
  484.         throw new NoSuchElementException();
  485.     }

  486.     /**
  487.      * Gets the index of the next token to return.
  488.      *
  489.      * @return the next token index
  490.      */
  491.     @Override
  492.     public int nextIndex() {
  493.         return tokenPos;
  494.     }

  495.     /**
  496.      * Checks whether there are any previous tokens that can be iterated to.
  497.      *
  498.      * @return true if there are previous tokens
  499.      */
  500.     @Override
  501.     public boolean hasPrevious() {
  502.         checkTokenized();
  503.         return tokenPos > 0;
  504.     }

  505.     /**
  506.      * Gets the token previous to the last returned token.
  507.      *
  508.      * @return the previous token
  509.      */
  510.     @Override
  511.     public String previous() {
  512.         if (hasPrevious()) {
  513.             return tokens[--tokenPos];
  514.         }
  515.         throw new NoSuchElementException();
  516.     }

  517.     /**
  518.      * Gets the index of the previous token.
  519.      *
  520.      * @return the previous token index
  521.      */
  522.     @Override
  523.     public int previousIndex() {
  524.         return tokenPos - 1;
  525.     }

  526.     /**
  527.      * Unsupported ListIterator operation.
  528.      *
  529.      * @throws UnsupportedOperationException always
  530.      */
  531.     @Override
  532.     public void remove() {
  533.         throw new UnsupportedOperationException("remove() is unsupported");
  534.     }

  535.     /**
  536.      * Unsupported ListIterator operation.
  537.      * @param obj this parameter ignored.
  538.      * @throws UnsupportedOperationException always
  539.      */
  540.     @Override
  541.     public void set(final String obj) {
  542.         throw new UnsupportedOperationException("set() is unsupported");
  543.     }

  544.     /**
  545.      * Unsupported ListIterator operation.
  546.      * @param obj this parameter ignored.
  547.      * @throws UnsupportedOperationException always
  548.      */
  549.     @Override
  550.     public void add(final String obj) {
  551.         throw new UnsupportedOperationException("add() is unsupported");
  552.     }

  553.     // Implementation
  554.     //-----------------------------------------------------------------------
  555.     /**
  556.      * Checks if tokenization has been done, and if not then do it.
  557.      */
  558.     private void checkTokenized() {
  559.         if (tokens == null) {
  560.             if (chars == null) {
  561.                 // still call tokenize as subclass may do some work
  562.                 final List<String> split = tokenize(null, 0, 0);
  563.                 tokens = split.toArray(new String[split.size()]);
  564.             } else {
  565.                 final List<String> split = tokenize(chars, 0, chars.length);
  566.                 tokens = split.toArray(new String[split.size()]);
  567.             }
  568.         }
  569.     }

  570.     /**
  571.      * Internal method to performs the tokenization.
  572.      * <p>
  573.      * Most users of this class do not need to call this method. This method
  574.      * will be called automatically by other (public) methods when required.
  575.      * <p>
  576.      * This method exists to allow subclasses to add code before or after the
  577.      * tokenization. For example, a subclass could alter the character array,
  578.      * offset or count to be parsed, or call the tokenizer multiple times on
  579.      * multiple strings. It is also be possible to filter the results.
  580.      * <p>
  581.      * <code>StrTokenizer</code> will always pass a zero offset and a count
  582.      * equal to the length of the array to this method, however a subclass
  583.      * may pass other values, or even an entirely different array.
  584.      *
  585.      * @param srcChars  the character array being tokenized, may be null
  586.      * @param offset  the start position within the character array, must be valid
  587.      * @param count  the number of characters to tokenize, must be valid
  588.      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
  589.      */
  590.     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
  591.         if (srcChars == null || count == 0) {
  592.             return Collections.emptyList();
  593.         }
  594.         final StrBuilder buf = new StrBuilder();
  595.         final List<String> tokenList = new ArrayList<>();
  596.         int pos = offset;

  597.         // loop around the entire buffer
  598.         while (pos >= 0 && pos < count) {
  599.             // find next token
  600.             pos = readNextToken(srcChars, pos, count, buf, tokenList);

  601.             // handle case where end of string is a delimiter
  602.             if (pos >= count) {
  603.                 addToken(tokenList, "");
  604.             }
  605.         }
  606.         return tokenList;
  607.     }

  608.     /**
  609.      * Adds a token to a list, paying attention to the parameters we've set.
  610.      *
  611.      * @param list  the list to add to
  612.      * @param tok  the token to add
  613.      */
  614.     private void addToken(final List<String> list, String tok) {
  615.         if (tok == null || tok.length() == 0) {
  616.             if (isIgnoreEmptyTokens()) {
  617.                 return;
  618.             }
  619.             if (isEmptyTokenAsNull()) {
  620.                 tok = null;
  621.             }
  622.         }
  623.         list.add(tok);
  624.     }

  625.     /**
  626.      * Reads character by character through the String to get the next token.
  627.      *
  628.      * @param srcChars  the character array being tokenized
  629.      * @param start  the first character of field
  630.      * @param len  the length of the character array being tokenized
  631.      * @param workArea  a temporary work area
  632.      * @param tokenList  the list of parsed tokens
  633.      * @return the starting position of the next field (the character
  634.      *  immediately after the delimiter), or -1 if end of string found
  635.      */
  636.     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
  637.         // skip all leading whitespace, unless it is the
  638.         // field delimiter or the quote character
  639.         while (start < len) {
  640.             final int removeLen = Math.max(
  641.                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
  642.                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
  643.             if (removeLen == 0 ||
  644.                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
  645.                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
  646.                 break;
  647.             }
  648.             start += removeLen;
  649.         }

  650.         // handle reaching end
  651.         if (start >= len) {
  652.             addToken(tokenList, "");
  653.             return -1;
  654.         }

  655.         // handle empty token
  656.         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
  657.         if (delimLen > 0) {
  658.             addToken(tokenList, "");
  659.             return start + delimLen;
  660.         }

  661.         // handle found token
  662.         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
  663.         if (quoteLen > 0) {
  664.             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
  665.         }
  666.         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
  667.     }

  668.     /**
  669.      * Reads a possibly quoted string token.
  670.      *
  671.      * @param srcChars  the character array being tokenized
  672.      * @param start  the first character of field
  673.      * @param len  the length of the character array being tokenized
  674.      * @param workArea  a temporary work area
  675.      * @param tokenList  the list of parsed tokens
  676.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  677.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  678.      * @return the starting position of the next field (the character
  679.      *  immediately after the delimiter, or if end of string found,
  680.      *  then the length of string
  681.      */
  682.     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
  683.                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
  684.         // Loop until we've found the end of the quoted
  685.         // string or the end of the input
  686.         workArea.clear();
  687.         int pos = start;
  688.         boolean quoting = quoteLen > 0;
  689.         int trimStart = 0;

  690.         while (pos < len) {
  691.             // quoting mode can occur several times throughout a string
  692.             // we must switch between quoting and non-quoting until we
  693.             // encounter a non-quoted delimiter, or end of string
  694.             if (quoting) {
  695.                 // In quoting mode

  696.                 // If we've found a quote character, see if it's
  697.                 // followed by a second quote.  If so, then we need
  698.                 // to actually put the quote character into the token
  699.                 // rather than end the token.
  700.                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  701.                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
  702.                         // matched pair of quotes, thus an escaped quote
  703.                         workArea.append(srcChars, pos, quoteLen);
  704.                         pos += quoteLen * 2;
  705.                         trimStart = workArea.size();
  706.                         continue;
  707.                     }

  708.                     // end of quoting
  709.                     quoting = false;
  710.                     pos += quoteLen;
  711.                     continue;
  712.                 }

  713.                 // copy regular character from inside quotes
  714.                 workArea.append(srcChars[pos++]);
  715.                 trimStart = workArea.size();

  716.             } else {
  717.                 // Not in quoting mode

  718.                 // check for delimiter, and thus end of token
  719.                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
  720.                 if (delimLen > 0) {
  721.                     // return condition when end of token found
  722.                     addToken(tokenList, workArea.substring(0, trimStart));
  723.                     return pos + delimLen;
  724.                 }

  725.                 // check for quote, and thus back into quoting mode
  726.                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
  727.                     quoting = true;
  728.                     pos += quoteLen;
  729.                     continue;
  730.                 }

  731.                 // check for ignored (outside quotes), and ignore
  732.                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
  733.                 if (ignoredLen > 0) {
  734.                     pos += ignoredLen;
  735.                     continue;
  736.                 }

  737.                 // check for trimmed character
  738.                 // don't yet know if its at the end, so copy to workArea
  739.                 // use trimStart to keep track of trim at the end
  740.                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
  741.                 if (trimmedLen > 0) {
  742.                     workArea.append(srcChars, pos, trimmedLen);
  743.                     pos += trimmedLen;
  744.                     continue;
  745.                 }

  746.                 // copy regular character from outside quotes
  747.                 workArea.append(srcChars[pos++]);
  748.                 trimStart = workArea.size();
  749.             }
  750.         }

  751.         // return condition when end of string found
  752.         addToken(tokenList, workArea.substring(0, trimStart));
  753.         return -1;
  754.     }

  755.     /**
  756.      * Checks if the characters at the index specified match the quote
  757.      * already matched in readNextToken().
  758.      *
  759.      * @param srcChars  the character array being tokenized
  760.      * @param pos  the position to check for a quote
  761.      * @param len  the length of the character array being tokenized
  762.      * @param quoteStart  the start position of the matched quote, 0 if no quoting
  763.      * @param quoteLen  the length of the matched quote, 0 if no quoting
  764.      * @return true if a quote is matched
  765.      */
  766.     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
  767.         for (int i = 0; i < quoteLen; i++) {
  768.             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
  769.                 return false;
  770.             }
  771.         }
  772.         return true;
  773.     }

  774.     // Delimiter
  775.     //-----------------------------------------------------------------------
  776.     /**
  777.      * Gets the field delimiter matcher.
  778.      *
  779.      * @return the delimiter matcher in use
  780.      */
  781.     public StrMatcher getDelimiterMatcher() {
  782.         return this.delimMatcher;
  783.     }

  784.     /**
  785.      * Sets the field delimiter matcher.
  786.      * <p>
  787.      * The delimitier is used to separate one token from another.
  788.      *
  789.      * @param delim  the delimiter matcher to use
  790.      * @return this, to enable chaining
  791.      */
  792.     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
  793.         if (delim == null) {
  794.             this.delimMatcher = StrMatcher.noneMatcher();
  795.         } else {
  796.             this.delimMatcher = delim;
  797.         }
  798.         return this;
  799.     }

  800.     /**
  801.      * Sets the field delimiter character.
  802.      *
  803.      * @param delim  the delimiter character to use
  804.      * @return this, to enable chaining
  805.      */
  806.     public StrTokenizer setDelimiterChar(final char delim) {
  807.         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
  808.     }

  809.     /**
  810.      * Sets the field delimiter string.
  811.      *
  812.      * @param delim  the delimiter string to use
  813.      * @return this, to enable chaining
  814.      */
  815.     public StrTokenizer setDelimiterString(final String delim) {
  816.         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
  817.     }

  818.     // Quote
  819.     //-----------------------------------------------------------------------
  820.     /**
  821.      * Gets the quote matcher currently in use.
  822.      * <p>
  823.      * The quote character is used to wrap data between the tokens.
  824.      * This enables delimiters to be entered as data.
  825.      * The default value is '"' (double quote).
  826.      *
  827.      * @return the quote matcher in use
  828.      */
  829.     public StrMatcher getQuoteMatcher() {
  830.         return quoteMatcher;
  831.     }

  832.     /**
  833.      * Set the quote matcher to use.
  834.      * <p>
  835.      * The quote character is used to wrap data between the tokens.
  836.      * This enables delimiters to be entered as data.
  837.      *
  838.      * @param quote  the quote matcher to use, null ignored
  839.      * @return this, to enable chaining
  840.      */
  841.     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
  842.         if (quote != null) {
  843.             this.quoteMatcher = quote;
  844.         }
  845.         return this;
  846.     }

  847.     /**
  848.      * Sets the quote character to use.
  849.      * <p>
  850.      * The quote character is used to wrap data between the tokens.
  851.      * This enables delimiters to be entered as data.
  852.      *
  853.      * @param quote  the quote character to use
  854.      * @return this, to enable chaining
  855.      */
  856.     public StrTokenizer setQuoteChar(final char quote) {
  857.         return setQuoteMatcher(StrMatcher.charMatcher(quote));
  858.     }

  859.     // Ignored
  860.     //-----------------------------------------------------------------------
  861.     /**
  862.      * Gets the ignored character matcher.
  863.      * <p>
  864.      * These characters are ignored when parsing the String, unless they are
  865.      * within a quoted region.
  866.      * The default value is not to ignore anything.
  867.      *
  868.      * @return the ignored matcher in use
  869.      */
  870.     public StrMatcher getIgnoredMatcher() {
  871.         return ignoredMatcher;
  872.     }

  873.     /**
  874.      * Set the matcher for characters to ignore.
  875.      * <p>
  876.      * These characters are ignored when parsing the String, unless they are
  877.      * within a quoted region.
  878.      *
  879.      * @param ignored  the ignored matcher to use, null ignored
  880.      * @return this, to enable chaining
  881.      */
  882.     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
  883.         if (ignored != null) {
  884.             this.ignoredMatcher = ignored;
  885.         }
  886.         return this;
  887.     }

  888.     /**
  889.      * Set the character to ignore.
  890.      * <p>
  891.      * This character is ignored when parsing the String, unless it is
  892.      * within a quoted region.
  893.      *
  894.      * @param ignored  the ignored character to use
  895.      * @return this, to enable chaining
  896.      */
  897.     public StrTokenizer setIgnoredChar(final char ignored) {
  898.         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
  899.     }

  900.     // Trimmer
  901.     //-----------------------------------------------------------------------
  902.     /**
  903.      * Gets the trimmer character matcher.
  904.      * <p>
  905.      * These characters are trimmed off on each side of the delimiter
  906.      * until the token or quote is found.
  907.      * The default value is not to trim anything.
  908.      *
  909.      * @return the trimmer matcher in use
  910.      */
  911.     public StrMatcher getTrimmerMatcher() {
  912.         return trimmerMatcher;
  913.     }

  914.     /**
  915.      * Sets the matcher for characters to trim.
  916.      * <p>
  917.      * These characters are trimmed off on each side of the delimiter
  918.      * until the token or quote is found.
  919.      *
  920.      * @param trimmer  the trimmer matcher to use, null ignored
  921.      * @return this, to enable chaining
  922.      */
  923.     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
  924.         if (trimmer != null) {
  925.             this.trimmerMatcher = trimmer;
  926.         }
  927.         return this;
  928.     }

  929.     //-----------------------------------------------------------------------
  930.     /**
  931.      * Gets whether the tokenizer currently returns empty tokens as null.
  932.      * The default for this property is false.
  933.      *
  934.      * @return true if empty tokens are returned as null
  935.      */
  936.     public boolean isEmptyTokenAsNull() {
  937.         return this.emptyAsNull;
  938.     }

  939.     /**
  940.      * Sets whether the tokenizer should return empty tokens as null.
  941.      * The default for this property is false.
  942.      *
  943.      * @param emptyAsNull  whether empty tokens are returned as null
  944.      * @return this, to enable chaining
  945.      */
  946.     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
  947.         this.emptyAsNull = emptyAsNull;
  948.         return this;
  949.     }

  950.     //-----------------------------------------------------------------------
  951.     /**
  952.      * Gets whether the tokenizer currently ignores empty tokens.
  953.      * The default for this property is true.
  954.      *
  955.      * @return true if empty tokens are not returned
  956.      */
  957.     public boolean isIgnoreEmptyTokens() {
  958.         return ignoreEmptyTokens;
  959.     }

  960.     /**
  961.      * Sets whether the tokenizer should ignore and not return empty tokens.
  962.      * The default for this property is true.
  963.      *
  964.      * @param ignoreEmptyTokens  whether empty tokens are not returned
  965.      * @return this, to enable chaining
  966.      */
  967.     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
  968.         this.ignoreEmptyTokens = ignoreEmptyTokens;
  969.         return this;
  970.     }

  971.     //-----------------------------------------------------------------------
  972.     /**
  973.      * Gets the String content that the tokenizer is parsing.
  974.      *
  975.      * @return the string content being parsed
  976.      */
  977.     public String getContent() {
  978.         if (chars == null) {
  979.             return null;
  980.         }
  981.         return new String(chars);
  982.     }

  983.     //-----------------------------------------------------------------------
  984.     /**
  985.      * Creates a new instance of this Tokenizer. The new instance is reset so
  986.      * that it will be at the start of the token list.
  987.      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
  988.      *
  989.      * @return a new instance of this Tokenizer which has been reset.
  990.      */
  991.     @Override
  992.     public Object clone() {
  993.         try {
  994.             return cloneReset();
  995.         } catch (final CloneNotSupportedException ex) {
  996.             return null;
  997.         }
  998.     }

  999.     /**
  1000.      * Creates a new instance of this Tokenizer. The new instance is reset so that
  1001.      * it will be at the start of the token list.
  1002.      *
  1003.      * @return a new instance of this Tokenizer which has been reset.
  1004.      * @throws CloneNotSupportedException if there is a problem cloning
  1005.      */
  1006.     Object cloneReset() throws CloneNotSupportedException {
  1007.         // this method exists to enable 100% test coverage
  1008.         final StrTokenizer cloned = (StrTokenizer) super.clone();
  1009.         if (cloned.chars != null) {
  1010.             cloned.chars = cloned.chars.clone();
  1011.         }
  1012.         cloned.reset();
  1013.         return cloned;
  1014.     }

  1015.     //-----------------------------------------------------------------------
  1016.     /**
  1017.      * Gets the String content that the tokenizer is parsing.
  1018.      *
  1019.      * @return the string content being parsed
  1020.      */
  1021.     @Override
  1022.     public String toString() {
  1023.         if (tokens == null) {
  1024.             return "StrTokenizer[not tokenized yet]";
  1025.         }
  1026.         return "StrTokenizer" + getTokenList();
  1027.     }

  1028. }