Coverage Report - org.apache.commons.lang3.text.StrTokenizer
 
Classes in this File    Line Coverage    Branch Coverage    Complexity
StrTokenizer            92% (230/250)    97% (90/92)        1,984
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.lang3.text;
 18  
 
 19  
 import java.util.ArrayList;
 20  
 import java.util.Collections;
 21  
 import java.util.List;
 22  
 import java.util.ListIterator;
 23  
 import java.util.NoSuchElementException;
 24  
 
 25  
 import org.apache.commons.lang3.ArrayUtils;
 26  
 import org.apache.commons.lang3.StringUtils;
 27  
 
 28  
 /**
 29  
 * Tokenizes a string based on delimiters (separators),
 30  
 * supporting quoting and ignored character concepts.
 31  
  * <p>
 32  
  * This class can split a String into many smaller strings. It aims
 33  
  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 34  
 * however it offers much more control and flexibility, including implementing
 35  
  * the <code>ListIterator</code> interface. By default, it is set up
 36  
  * like <code>StringTokenizer</code>.
 37  
  * <p>
 38  
  * The input String is split into a number of <i>tokens</i>.
 39  
  * Each token is separated from the next String by a <i>delimiter</i>.
 40  
  * One or more delimiter characters must be specified.
 41  
  * <p>
 42  
  * Each token may be surrounded by quotes.
 43  
  * The <i>quote</i> matcher specifies the quote character(s).
 44  
  * A quote may be escaped within a quoted section by duplicating itself.
 45  
  * <p>
 46  
  * Between each token and the delimiter are potentially characters that need trimming.
 47  
  * The <i>trimmer</i> matcher specifies these characters.
 48  
  * One usage might be to trim whitespace characters.
 49  
  * <p>
 50  
 * At any point outside the quotes there might be invalid characters.
 51  
  * The <i>ignored</i> matcher specifies these characters to be removed.
 52  
  * One usage might be to remove new line characters.
 53  
  * <p>
 54  
  * Empty tokens may be removed or returned as null.
 55  
  * <pre>
 56  
  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 57  
  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 58  
  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 59  
  * </pre>
 60  
  * <p>
 61  
  *
 62  
  * This tokenizer has the following properties and options:
 63  
  *
 64  
  * <table summary="Tokenizer Properties">
 65  
  *  <tr>
 66  
  *   <th>Property</th><th>Type</th><th>Default</th>
 67  
  *  </tr>
 68  
  *  <tr>
 69  
  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 70  
  *  </tr>
 71  
  *  <tr>
 72  
  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 73  
  *  </tr>
 74  
  *  <tr>
 75  
  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 76  
  *  </tr>
 77  
  *  <tr>
 78  
  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 79  
  *  </tr>
 80  
  *  <tr>
 81  
  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 82  
  *  </tr>
 83  
  * </table>
 84  
  *
 85  
  * @since 2.2
 86  
  * @deprecated as of 3.6, use commons-text
 87  
  * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StrTokenizer.html">
 88  
  * StrTokenizer</a> instead
 89  
  */
 90  0
 @Deprecated
 91  
 public class StrTokenizer implements ListIterator<String>, Cloneable {
 92  
 
 93  
     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
 94  
     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
 95  
     static {
 96  1
         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 97  1
         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
 98  1
         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 99  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 100  1
         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 101  1
         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 102  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 103  
 
 104  1
         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 105  1
         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
 106  1
         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 107  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 108  1
         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 109  1
         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 110  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 111  1
     }
 112  
 
 113  
     /** The text to work on. */
 114  
     private char chars[];
 115  
     /** The parsed tokens */
 116  
     private String tokens[];
 117  
     /** The current iteration position */
 118  
     private int tokenPos;
 119  
 
 120  
     /** The delimiter matcher */
 121  67
     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
 122  
     /** The quote matcher */
 123  67
     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
 124  
     /** The ignored matcher */
 125  67
     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
 126  
     /** The trimmer matcher */
 127  67
     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
 128  
 
 129  
     /** Whether to return empty tokens as null */
 130  67
     private boolean emptyAsNull = false;
 131  
     /** Whether to ignore empty tokens */
 132  67
     private boolean ignoreEmptyTokens = true;
 133  
 
 134  
     //-----------------------------------------------------------------------
 135  
 
 136  
     /**
 137  
      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 138  
      *
 139  
      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 140  
      */
 141  
     private static StrTokenizer getCSVClone() {
 142  24
         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
 143  
     }
 144  
 
 145  
     /**
 146  
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
 147  
     * The default for CSV processing
 148  
     * is to trim whitespace from both ends (which can be overridden with
 149  
      * the setTrimmer method).
 150  
      * <p>
 151  
      * You must call a "reset" method to set the string which you want to parse.
 152  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 153  
      */
 154  
     public static StrTokenizer getCSVInstance() {
 155  15
         return getCSVClone();
 156  
     }
 157  
 
 158  
     /**
 159  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 160  
      * initializing it with the given input.  The default for CSV processing
 161  
     * is to trim whitespace from both ends (which can be overridden with
 162  
      * the setTrimmer method).
 163  
      *
 164  
      * @param input  the text to parse
 165  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 166  
      */
 167  
     public static StrTokenizer getCSVInstance(final String input) {
 168  5
         final StrTokenizer tok = getCSVClone();
 169  5
         tok.reset(input);
 170  5
         return tok;
 171  
     }
 172  
 
 173  
     /**
 174  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 175  
      * initializing it with the given input.  The default for CSV processing
 176  
     * is to trim whitespace from both ends (which can be overridden with
 177  
      * the setTrimmer method).
 178  
      *
 179  
      * @param input  the text to parse
 180  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 181  
      */
 182  
     public static StrTokenizer getCSVInstance(final char[] input) {
 183  4
         final StrTokenizer tok = getCSVClone();
 184  4
         tok.reset(input);
 185  4
         return tok;
 186  
     }
 187  
 
 188  
     /**
 189  
      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 190  
      *
 191  
      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 192  
      */
 193  
     private static StrTokenizer getTSVClone() {
 194  18
         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
 195  
     }
 196  
 
 197  
 
 198  
     /**
 199  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 200  
     * The default for TSV processing is to trim whitespace from both ends
 201  
      * (which can be overridden with the setTrimmer method).
 202  
      * <p>
 203  
      * You must call a "reset" method to set the string which you want to parse.
 204  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 205  
      */
 206  
     public static StrTokenizer getTSVInstance() {
 207  15
         return getTSVClone();
 208  
     }
 209  
 
 210  
     /**
 211  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 212  
     * The default for TSV processing is to trim whitespace from both ends
 213  
      * (which can be overridden with the setTrimmer method).
 214  
      * @param input  the string to parse
 215  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 216  
      */
 217  
     public static StrTokenizer getTSVInstance(final String input) {
 218  2
         final StrTokenizer tok = getTSVClone();
 219  2
         tok.reset(input);
 220  2
         return tok;
 221  
     }
 222  
 
 223  
     /**
 224  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 225  
     * The default for TSV processing is to trim whitespace from both ends
 226  
      * (which can be overridden with the setTrimmer method).
 227  
      * @param input  the string to parse
 228  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 229  
      */
 230  
     public static StrTokenizer getTSVInstance(final char[] input) {
 231  1
         final StrTokenizer tok = getTSVClone();
 232  1
         tok.reset(input);
 233  1
         return tok;
 234  
     }
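
    // Illustrative usage sketch (not part of the original source): parsing one
    // record with the factory methods above. The expected tokens follow from the
    // CSV/TSV prototypes configured in the static initializer (comma or tab
    // delimiter, double-quote quoting, whitespace trimming).
    //
    //     StrTokenizer tok = StrTokenizer.getCSVInstance(" a, \"b, b\", c ");
    //     String[] fields = tok.getTokenArray();   // ["a", "b, b", "c"]
    //
    //     // The no-argument variants need a reset(...) call before use:
    //     StrTokenizer reusable = StrTokenizer.getTSVInstance();
    //     reusable.reset("x\ty\tz");               // tokens: ["x", "y", "z"]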
 235  
 
 236  
     //-----------------------------------------------------------------------
 237  
     /**
 238  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 239  
      * as per StringTokenizer, but with no text to tokenize.
 240  
      * <p>
 241  
      * This constructor is normally used with {@link #reset(String)}.
 242  
      */
 243  
     public StrTokenizer() {
 244  6
         super();
 245  6
         this.chars = null;
 246  6
     }
 247  
 
 248  
     /**
 249  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 250  
      * as per StringTokenizer.
 251  
      *
 252  
      * @param input  the string which is to be parsed
 253  
      */
 254  
     public StrTokenizer(final String input) {
 255  49
         super();
 256  49
         if (input != null) {
 257  46
             chars = input.toCharArray();
 258  
         } else {
 259  3
             chars = null;
 260  
         }
 261  49
     }
 262  
 
 263  
     /**
 264  
      * Constructs a tokenizer splitting on the specified delimiter character.
 265  
      *
 266  
      * @param input  the string which is to be parsed
 267  
      * @param delim  the field delimiter character
 268  
      */
 269  
     public StrTokenizer(final String input, final char delim) {
 270  23
         this(input);
 271  23
         setDelimiterChar(delim);
 272  23
     }
 273  
 
 274  
     /**
 275  
      * Constructs a tokenizer splitting on the specified delimiter string.
 276  
      *
 277  
      * @param input  the string which is to be parsed
 278  
      * @param delim  the field delimiter string
 279  
      */
 280  
     public StrTokenizer(final String input, final String delim) {
 281  0
         this(input);
 282  0
         setDelimiterString(delim);
 283  0
     }
 284  
 
 285  
     /**
 286  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 287  
      *
 288  
      * @param input  the string which is to be parsed
 289  
      * @param delim  the field delimiter matcher
 290  
      */
 291  
     public StrTokenizer(final String input, final StrMatcher delim) {
 292  0
         this(input);
 293  0
         setDelimiterMatcher(delim);
 294  0
     }
 295  
 
 296  
     /**
 297  
      * Constructs a tokenizer splitting on the specified delimiter character
 298  
      * and handling quotes using the specified quote character.
 299  
      *
 300  
      * @param input  the string which is to be parsed
 301  
      * @param delim  the field delimiter character
 302  
      * @param quote  the field quoted string character
 303  
      */
 304  
     public StrTokenizer(final String input, final char delim, final char quote) {
 305  11
         this(input, delim);
 306  11
         setQuoteChar(quote);
 307  11
     }
 308  
 
 309  
     /**
 310  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 311  
      * and handling quotes using the specified quote matcher.
 312  
      *
 313  
      * @param input  the string which is to be parsed
 314  
      * @param delim  the field delimiter matcher
 315  
      * @param quote  the field quoted string matcher
 316  
      */
 317  
     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
 318  0
         this(input, delim);
 319  0
         setQuoteMatcher(quote);
 320  0
     }
 321  
 
 322  
     /**
 323  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 324  
      * as per StringTokenizer.
 325  
      *
 326  
     * @param input  the character array which is to be parsed, cloned internally
 327  
      */
 328  
     public StrTokenizer(final char[] input) {
 329  12
         super();
 330  12
         this.chars = ArrayUtils.clone(input);
 331  12
     }
 332  
 
 333  
     /**
 334  
      * Constructs a tokenizer splitting on the specified character.
 335  
      *
 336  
     * @param input  the character array which is to be parsed, cloned internally
 337  
      * @param delim the field delimiter character
 338  
      */
 339  
     public StrTokenizer(final char[] input, final char delim) {
 340  6
         this(input);
 341  6
         setDelimiterChar(delim);
 342  6
     }
 343  
 
 344  
     /**
 345  
      * Constructs a tokenizer splitting on the specified string.
 346  
      *
 347  
     * @param input  the character array which is to be parsed, cloned internally
 348  
      * @param delim the field delimiter string
 349  
      */
 350  
     public StrTokenizer(final char[] input, final String delim) {
 351  0
         this(input);
 352  0
         setDelimiterString(delim);
 353  0
     }
 354  
 
 355  
     /**
 356  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 357  
      *
 358  
     * @param input  the character array which is to be parsed, cloned internally
 359  
      * @param delim  the field delimiter matcher
 360  
      */
 361  
     public StrTokenizer(final char[] input, final StrMatcher delim) {
 362  0
         this(input);
 363  0
         setDelimiterMatcher(delim);
 364  0
     }
 365  
 
 366  
     /**
 367  
      * Constructs a tokenizer splitting on the specified delimiter character
 368  
      * and handling quotes using the specified quote character.
 369  
      *
 370  
     * @param input  the character array which is to be parsed, cloned internally
 371  
      * @param delim  the field delimiter character
 372  
      * @param quote  the field quoted string character
 373  
      */
 374  
     public StrTokenizer(final char[] input, final char delim, final char quote) {
 375  3
         this(input, delim);
 376  3
         setQuoteChar(quote);
 377  3
     }
 378  
 
 379  
     /**
 380  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 381  
      * and handling quotes using the specified quote matcher.
 382  
      *
 383  
     * @param input  the character array which is to be parsed, cloned internally
 384  
      * @param delim  the field delimiter character
 385  
      * @param quote  the field quoted string character
 386  
      */
 387  
     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
 388  0
         this(input, delim);
 389  0
         setQuoteMatcher(quote);
 390  0
     }
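
    // Illustrative usage sketch (not part of the original source): the
    // delimiter/quote constructors above. With ';' as the delimiter and '\'' as
    // the quote character, delimiters inside a quoted section are kept as data.
    //
    //     StrTokenizer tok = new StrTokenizer("a;'b;c';d", ';', '\'');
    //     // tokens: ["a", "b;c", "d"]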
 391  
 
 392  
     // API
 393  
     //-----------------------------------------------------------------------
 394  
     /**
 395  
      * Gets the number of tokens found in the String.
 396  
      *
 397  
      * @return the number of matched tokens
 398  
      */
 399  
     public int size() {
 400  17
         checkTokenized();
 401  17
         return tokens.length;
 402  
     }
 403  
 
 404  
     /**
 405  
      * Gets the next token from the String.
 406  
      * Equivalent to {@link #next()} except it returns null rather than
 407  
      * throwing {@link NoSuchElementException} when no tokens remain.
 408  
      *
 409  
      * @return the next sequential token, or null when no more tokens are found
 410  
      */
 411  
     public String nextToken() {
 412  52
         if (hasNext()) {
 413  34
             return tokens[tokenPos++];
 414  
         }
 415  18
         return null;
 416  
     }
 417  
 
 418  
     /**
 419  
      * Gets the previous token from the String.
 420  
      *
 421  
      * @return the previous sequential token, or null when no more tokens are found
 422  
      */
 423  
     public String previousToken() {
 424  50
         if (hasPrevious()) {
 425  30
             return tokens[--tokenPos];
 426  
         }
 427  20
         return null;
 428  
     }
 429  
 
 430  
     /**
 431  
      * Gets a copy of the full token list as an independent modifiable array.
 432  
      *
 433  
      * @return the tokens as a String array
 434  
      */
 435  
     public String[] getTokenArray() {
 436  12
         checkTokenized();
 437  12
         return tokens.clone();
 438  
     }
 439  
 
 440  
     /**
 441  
      * Gets a copy of the full token list as an independent modifiable list.
 442  
      *
 443  
     * @return the tokens as a String list
 444  
      */
 445  
     public List<String> getTokenList() {
 446  2
         checkTokenized();
 447  2
         final List<String> list = new ArrayList<>(tokens.length);
 448  10
         for (final String element : tokens) {
 449  8
             list.add(element);
 450  
         }
 451  2
         return list;
 452  
     }
 453  
 
 454  
     /**
 455  
      * Resets this tokenizer, forgetting all parsing and iteration already completed.
 456  
      * <p>
 457  
      * This method allows the same tokenizer to be reused for the same String.
 458  
      *
 459  
      * @return this, to enable chaining
 460  
      */
 461  
     public StrTokenizer reset() {
 462  69
         tokenPos = 0;
 463  69
         tokens = null;
 464  69
         return this;
 465  
     }
 466  
 
 467  
     /**
 468  
      * Reset this tokenizer, giving it a new input string to parse.
 469  
      * In this manner you can re-use a tokenizer with the same settings
 470  
      * on multiple input lines.
 471  
      *
 472  
      * @param input  the new string to tokenize, null sets no text to parse
 473  
      * @return this, to enable chaining
 474  
      */
 475  
     public StrTokenizer reset(final String input) {
 476  10
         reset();
 477  10
         if (input != null) {
 478  9
             this.chars = input.toCharArray();
 479  
         } else {
 480  1
             this.chars = null;
 481  
         }
 482  10
         return this;
 483  
     }
 484  
 
 485  
     /**
 486  
      * Reset this tokenizer, giving it a new input string to parse.
 487  
      * In this manner you can re-use a tokenizer with the same settings
 488  
      * on multiple input lines.
 489  
      *
 490  
     * @param input  the new character array to tokenize, cloned internally, null sets no text to parse
 491  
      * @return this, to enable chaining
 492  
      */
 493  
     public StrTokenizer reset(final char[] input) {
 494  10
         reset();
 495  10
         this.chars = ArrayUtils.clone(input);
 496  10
         return this;
 497  
     }
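
    // Illustrative usage sketch (not part of the original source): re-using one
    // configured tokenizer across several input lines via reset(String), as the
    // Javadoc above describes.
    //
    //     StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
    //     for (String line : new String[] {"a,b", "c,d"}) {
    //         List<String> fields = tok.reset(line).getTokenList();
    //         // first pass: ["a", "b"]; second pass: ["c", "d"]
    //     }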
 498  
 
 499  
     // ListIterator
 500  
     //-----------------------------------------------------------------------
 501  
     /**
 502  
      * Checks whether there are any more tokens.
 503  
      *
 504  
      * @return true if there are more tokens
 505  
      */
 506  
     @Override
 507  
     public boolean hasNext() {
 508  233
         checkTokenized();
 509  233
         return tokenPos < tokens.length;
 510  
     }
 511  
 
 512  
     /**
 513  
      * Gets the next token.
 514  
      *
 515  
      * @return the next String token
 516  
      * @throws NoSuchElementException if there are no more elements
 517  
      */
 518  
     @Override
 519  
     public String next() {
 520  116
         if (hasNext()) {
 521  111
             return tokens[tokenPos++];
 522  
         }
 523  5
         throw new NoSuchElementException();
 524  
     }
 525  
 
 526  
     /**
 527  
      * Gets the index of the next token to return.
 528  
      *
 529  
      * @return the next token index
 530  
      */
 531  
     @Override
 532  
     public int nextIndex() {
 533  90
         return tokenPos;
 534  
     }
 535  
 
 536  
     /**
 537  
      * Checks whether there are any previous tokens that can be iterated to.
 538  
      *
 539  
      * @return true if there are previous tokens
 540  
      */
 541  
     @Override
 542  
     public boolean hasPrevious() {
 543  77
         checkTokenized();
 544  77
         return tokenPos > 0;
 545  
     }
 546  
 
 547  
     /**
 548  
      * Gets the token previous to the last returned token.
 549  
      *
 550  
      * @return the previous token
 551  
      */
 552  
     @Override
 553  
     public String previous() {
 554  9
         if (hasPrevious()) {
 555  8
             return tokens[--tokenPos];
 556  
         }
 557  1
         throw new NoSuchElementException();
 558  
     }
 559  
 
 560  
     /**
 561  
      * Gets the index of the previous token.
 562  
      *
 563  
      * @return the previous token index
 564  
      */
 565  
     @Override
 566  
     public int previousIndex() {
 567  20
         return tokenPos - 1;
 568  
     }
 569  
 
 570  
     /**
 571  
      * Unsupported ListIterator operation.
 572  
      *
 573  
      * @throws UnsupportedOperationException always
 574  
      */
 575  
     @Override
 576  
     public void remove() {
 577  1
         throw new UnsupportedOperationException("remove() is unsupported");
 578  
     }
 579  
 
 580  
     /**
 581  
      * Unsupported ListIterator operation.
 582  
      * @param obj this parameter ignored.
 583  
      * @throws UnsupportedOperationException always
 584  
      */
 585  
     @Override
 586  
     public void set(final String obj) {
 587  1
         throw new UnsupportedOperationException("set() is unsupported");
 588  
     }
 589  
 
 590  
     /**
 591  
      * Unsupported ListIterator operation.
 592  
      * @param obj this parameter ignored.
 593  
      * @throws UnsupportedOperationException always
 594  
      */
 595  
     @Override
 596  
     public void add(final String obj) {
 597  1
         throw new UnsupportedOperationException("add() is unsupported");
 598  
     }
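
    // Illustrative usage sketch (not part of the original source): because
    // StrTokenizer implements ListIterator<String>, tokens can be walked
    // forwards and backwards; remove/set/add always throw, as documented above.
    //
    //     StrTokenizer tok = new StrTokenizer("a b c");
    //     while (tok.hasNext()) {
    //         String token = tok.next();      // "a", then "b", then "c"
    //     }
    //     while (tok.hasPrevious()) {
    //         String token = tok.previous();  // "c", then "b", then "a"
    //     }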
 599  
 
 600  
     // Implementation
 601  
     //-----------------------------------------------------------------------
 602  
     /**
 603  
     * Checks if tokenization has been done, and if not then does it.
 604  
      */
 605  
     private void checkTokenized() {
 606  341
         if (tokens == null) {
 607  84
             if (chars == null) {
 608  
                 // still call tokenize as subclass may do some work
 609  16
                 final List<String> split = tokenize(null, 0, 0);
 610  16
                 tokens = split.toArray(new String[split.size()]);
 611  16
             } else {
 612  68
                 final List<String> split = tokenize(chars, 0, chars.length);
 613  68
                 tokens = split.toArray(new String[split.size()]);
 614  
             }
 615  
         }
 616  341
     }
 617  
 
 618  
     /**
 619  
     * Internal method that performs the tokenization.
 620  
      * <p>
 621  
      * Most users of this class do not need to call this method. This method
 622  
      * will be called automatically by other (public) methods when required.
 623  
      * <p>
 624  
      * This method exists to allow subclasses to add code before or after the
 625  
      * tokenization. For example, a subclass could alter the character array,
 626  
      * offset or count to be parsed, or call the tokenizer multiple times on
 627  
     * multiple strings. It is also possible to filter the results.
 628  
      * <p>
 629  
      * <code>StrTokenizer</code> will always pass a zero offset and a count
 630  
     * equal to the length of the array to this method; however, a subclass
 631  
      * may pass other values, or even an entirely different array.
 632  
      *
 633  
      * @param srcChars  the character array being tokenized, may be null
 634  
      * @param offset  the start position within the character array, must be valid
 635  
      * @param count  the number of characters to tokenize, must be valid
 636  
      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
 637  
      */
 638  
     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
 639  84
         if (srcChars == null || count == 0) {
 640  22
             return Collections.emptyList();
 641  
         }
 642  62
         final StrBuilder buf = new StrBuilder();
 643  62
         final List<String> tokenList = new ArrayList<>();
 644  62
         int pos = offset;
 645  
 
 646  
         // loop around the entire buffer
 647  264
         while (pos >= 0 && pos < count) {
 648  
             // find next token
 649  202
             pos = readNextToken(srcChars, pos, count, buf, tokenList);
 650  
 
 651  
             // handle case where end of string is a delimiter
 652  202
             if (pos >= count) {
 653  12
                 addToken(tokenList, StringUtils.EMPTY);
 654  
             }
 655  
         }
 656  62
         return tokenList;
 657  
     }
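
    // Illustrative sketch (not part of the original source) of the subclassing
    // hook described in the Javadoc above: post-processing the tokens produced
    // by the default tokenization. The anonymous subclass is hypothetical.
    //
    //     StrTokenizer upper = new StrTokenizer("a b c") {
    //         @Override
    //         protected List<String> tokenize(char[] srcChars, int offset, int count) {
    //             List<String> result = new ArrayList<>();
    //             for (String token : super.tokenize(srcChars, offset, count)) {
    //                 result.add(token.toUpperCase());
    //             }
    //             return result;
    //         }
    //     };
    //     // upper.getTokenArray() -> ["A", "B", "C"]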
 658  
 
 659  
     /**
 660  
      * Adds a token to a list, paying attention to the parameters we've set.
 661  
      *
 662  
      * @param list  the list to add to
 663  
      * @param tok  the token to add
 664  
      */
 665  
     private void addToken(final List<String> list, String tok) {
 666  214
         if (StringUtils.isEmpty(tok)) {
 667  35
             if (isIgnoreEmptyTokens()) {
 668  12
                 return;
 669  
             }
 670  23
             if (isEmptyTokenAsNull()) {
 671  11
                 tok = null;
 672  
             }
 673  
         }
 674  202
         list.add(tok);
 675  202
     }
 676  
 
 677  
     /**
 678  
      * Reads character by character through the String to get the next token.
 679  
      *
 680  
      * @param srcChars  the character array being tokenized
 681  
      * @param start  the first character of field
 682  
      * @param len  the length of the character array being tokenized
 683  
      * @param workArea  a temporary work area
 684  
      * @param tokenList  the list of parsed tokens
 685  
      * @return the starting position of the next field (the character
 686  
      *  immediately after the delimiter), or -1 if end of string found
 687  
      */
 688  
     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
 689  
         // skip all leading whitespace, unless it is the
 690  
         // field delimiter or the quote character
 691  271
         while (start < len) {
 692  532
             final int removeLen = Math.max(
 693  266
                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
 694  266
                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
 695  266
             if (removeLen == 0 ||
 696  69
                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
 697  69
                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
 698  0
                 break;
 699  
             }
 700  69
             start += removeLen;
 701  69
         }
 702  
 
 703  
         // handle reaching end
 704  202
         if (start >= len) {
 705  5
             addToken(tokenList, StringUtils.EMPTY);
 706  5
             return -1;
 707  
         }
 708  
 
 709  
         // handle empty token
 710  197
         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
 711  197
         if (delimLen > 0) {
 712  18
             addToken(tokenList, StringUtils.EMPTY);
 713  18
             return start + delimLen;
 714  
         }
 715  
 
 716  
         // handle found token
 717  179
         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
 718  179
         if (quoteLen > 0) {
 719  17
             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
 720  
         }
 721  162
         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
 722  
     }
 723  
 
 724  
     /**
 725  
      * Reads a possibly quoted string token.
 726  
      *
 727  
      * @param srcChars  the character array being tokenized
 728  
      * @param start  the first character of field
 729  
      * @param len  the length of the character array being tokenized
 730  
      * @param workArea  a temporary work area
 731  
      * @param tokenList  the list of parsed tokens
 732  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 733  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 734  
      * @return the starting position of the next field (the character
 735  
      *  immediately after the delimiter, or if end of string found,
 736  
     *  then the length of the string)
 737  
      */
 738  
     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
 739  
                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
 740  
         // Loop until we've found the end of the quoted
 741  
         // string or the end of the input
 742  179
         workArea.clear();
 743  179
         int pos = start;
 744  179
         boolean quoting = quoteLen > 0;
 745  179
         int trimStart = 0;
 746  
 
 747  490
         while (pos < len) {
 748  
             // quoting mode can occur several times throughout a string
 749  
             // we must switch between quoting and non-quoting until we
 750  
             // encounter a non-quoted delimiter, or end of string
 751  445
             if (quoting) {
 752  
                 // In quoting mode
 753  
 
 754  
                 // If we've found a quote character, see if it's
 755  
                 // followed by a second quote.  If so, then we need
 756  
                 // to actually put the quote character into the token
 757  
                 // rather than end the token.
 758  93
                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 759  27
                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
 760  
                         // matched pair of quotes, thus an escaped quote
 761  7
                         workArea.append(srcChars, pos, quoteLen);
 762  7
                         pos += quoteLen * 2;
 763  7
                         trimStart = workArea.size();
 764  7
                         continue;
 765  
                     }
 766  
 
 767  
                     // end of quoting
 768  20
                     quoting = false;
 769  20
                     pos += quoteLen;
 770  20
                     continue;
 771  
                 }
 772  
 
 773  
                 // copy regular character from inside quotes
 774  66
                 workArea.append(srcChars[pos++]);
 775  66
                 trimStart = workArea.size();
 776  
 
 777  
             } else {
 778  
                 // Not in quoting mode
 779  
 
 780  
                 // check for delimiter, and thus end of token
 781  352
                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
 782  352
                 if (delimLen > 0) {
 783  
                     // return condition when end of token found
 784  134
                     addToken(tokenList, workArea.substring(0, trimStart));
 785  134
                     return pos + delimLen;
 786  
                 }
 787  
 
 788  
                 // check for quote, and thus back into quoting mode
 789  218
                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 790  4
                     quoting = true;
 791  4
                     pos += quoteLen;
 792  4
                     continue;
 793  
                 }
 794  
 
 795  
                 // check for ignored (outside quotes), and ignore
 796  214
                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
 797  214
                 if (ignoredLen > 0) {
 798  12
                     pos += ignoredLen;
 799  12
                     continue;
 800  
                 }
 801  
 
 802  
                 // check for trimmed character
 803  
                // don't yet know if it's at the end, so copy to workArea
 804  
                 // use trimStart to keep track of trim at the end
 805  202
                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
 806  202
                 if (trimmedLen > 0) {
 807  17
                     workArea.append(srcChars, pos, trimmedLen);
 808  17
                     pos += trimmedLen;
 809  17
                     continue;
 810  
                 }
 811  
 
 812  
                 // copy regular character from outside quotes
 813  185
                 workArea.append(srcChars[pos++]);
 814  185
                 trimStart = workArea.size();
 815  185
             }
 816  
         }
 817  
 
 818  
         // return condition when end of string found
 819  45
         addToken(tokenList, workArea.substring(0, trimStart));
 820  45
         return -1;
 821  
     }
 822  
 
 823  
     /**
 824  
      * Checks if the characters at the index specified match the quote
 825  
      * already matched in readNextToken().
 826  
      *
 827  
      * @param srcChars  the character array being tokenized
 828  
      * @param pos  the position to check for a quote
 829  
      * @param len  the length of the character array being tokenized
 830  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 831  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 832  
      * @return true if a quote is matched
 833  
      */
 834  
     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
 835  173
         for (int i = 0; i < quoteLen; i++) {
 836  135
             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
 837  97
                 return false;
 838  
             }
 839  
         }
 840  38
         return true;
 841  
     }
 842  
 
 843  
     // Delimiter
 844  
     //-----------------------------------------------------------------------
 845  
     /**
 846  
      * Gets the field delimiter matcher.
 847  
      *
 848  
      * @return the delimiter matcher in use
 849  
      */
 850  
     public StrMatcher getDelimiterMatcher() {
 851  622
         return this.delimMatcher;
 852  
     }
 853  
 
 854  
     /**
 855  
      * Sets the field delimiter matcher.
 856  
      * <p>
 857  
     * The delimiter is used to separate one token from another.
 858  
      *
 859  
      * @param delim  the delimiter matcher to use
 860  
      * @return this, to enable chaining
 861  
      */
 862  
     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
 863  42
         if (delim == null) {
 864  1
             this.delimMatcher = StrMatcher.noneMatcher();
 865  
         } else {
 866  41
             this.delimMatcher = delim;
 867  
         }
 868  42
         return this;
 869  
     }
 870  
 
 871  
     /**
 872  
      * Sets the field delimiter character.
 873  
      *
 874  
      * @param delim  the delimiter character to use
 875  
      * @return this, to enable chaining
 876  
      */
 877  
     public StrTokenizer setDelimiterChar(final char delim) {
 878  36
         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
 879  
     }
 880  
 
 881  
     /**
 882  
      * Sets the field delimiter string.
 883  
      *
 884  
      * @param delim  the delimiter string to use
 885  
      * @return this, to enable chaining
 886  
      */
 887  
     public StrTokenizer setDelimiterString(final String delim) {
 888  1
         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
 889  
     }
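
    // Illustrative usage sketch (not part of the original source): the setters
    // above return this, so a tokenizer can be configured fluently.
    //
    //     StrTokenizer tok = new StrTokenizer("key=value; other=thing")
    //             .setDelimiterString("; ")
    //             .setTrimmerMatcher(StrMatcher.trimMatcher());
    //     // tokens: ["key=value", "other=thing"]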
 890  
 
 891  
     // Quote
 892  
     //-----------------------------------------------------------------------
 893  
     /**
 894  
      * Gets the quote matcher currently in use.
 895  
      * <p>
 896  
      * The quote character is used to wrap data between the tokens.
 897  
      * This enables delimiters to be entered as data.
 898  
     * The default is not to use quoting (the none matcher).
 899  
      *
 900  
      * @return the quote matcher in use
 901  
      */
 902  
     public StrMatcher getQuoteMatcher() {
 903  250
         return quoteMatcher;
 904  
     }
 905  
 
 906  
     /**
 907  
      * Set the quote matcher to use.
 908  
      * <p>
 909  
      * The quote character is used to wrap data between the tokens.
 910  
      * This enables delimiters to be entered as data.
 911  
      *
 912  
      * @param quote  the quote matcher to use, null ignored
 913  
      * @return this, to enable chaining
 914  
      */
 915  
     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
 916  28
         if (quote != null) {
 917  27
             this.quoteMatcher = quote;
 918  
         }
 919  28
         return this;
 920  
     }
 921  
 
 922  
     /**
 923  
      * Sets the quote character to use.
 924  
      * <p>
 925  
      * The quote character is used to wrap data between the tokens.
 926  
      * This enables delimiters to be entered as data.
 927  
      *
 928  
      * @param quote  the quote character to use
 929  
      * @return this, to enable chaining
 930  
      */
 931  
     public StrTokenizer setQuoteChar(final char quote) {
 932  21
         return setQuoteMatcher(StrMatcher.charMatcher(quote));
 933  
     }
 934  
 
 935  
     // Ignored
 936  
     //-----------------------------------------------------------------------
 937  
     /**
 938  
      * Gets the ignored character matcher.
 939  
      * <p>
 940  
      * These characters are ignored when parsing the String, unless they are
 941  
      * within a quoted region.
 942  
      * The default value is not to ignore anything.
 943  
      *
 944  
      * @return the ignored matcher in use
 945  
      */
 946  
     public StrMatcher getIgnoredMatcher() {
 947  480
         return ignoredMatcher;
 948  
     }
 949  
 
 950  
     /**
 951  
      * Set the matcher for characters to ignore.
 952  
      * <p>
 953  
      * These characters are ignored when parsing the String, unless they are
 954  
      * within a quoted region.
 955  
      *
 956  
      * @param ignored  the ignored matcher to use, null ignored
 957  
      * @return this, to enable chaining
 958  
      */
 959  
     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
 960  16
         if (ignored != null) {
 961  15
             this.ignoredMatcher = ignored;
 962  
         }
 963  16
         return this;
 964  
     }
 965  
 
 966  
     /**
 967  
      * Set the character to ignore.
 968  
      * <p>
 969  
      * This character is ignored when parsing the String, unless it is
 970  
      * within a quoted region.
 971  
      *
 972  
      * @param ignored  the ignored character to use
 973  
      * @return this, to enable chaining
 974  
      */
 975  
     public StrTokenizer setIgnoredChar(final char ignored) {
 976  1
         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
 977  
     }
 978  
 
 979  
     // Trimmer
 980  
     //-----------------------------------------------------------------------
 981  
     /**
 982  
      * Gets the trimmer character matcher.
 983  
      * <p>
 984  
      * These characters are trimmed off on each side of the delimiter
 985  
      * until the token or quote is found.
 986  
      * The default value is not to trim anything.
 987  
      *
 988  
      * @return the trimmer matcher in use
 989  
      */
 990  
     public StrMatcher getTrimmerMatcher() {
 991  468
         return trimmerMatcher;
 992  
     }
 993  
 
 994  
     /**
 995  
      * Sets the matcher for characters to trim.
 996  
      * <p>
 997  
      * These characters are trimmed off on each side of the delimiter
 998  
      * until the token or quote is found.
 999  
      *
 1000  
      * @param trimmer  the trimmer matcher to use, null ignored
 1001  
      * @return this, to enable chaining
 1002  
      */
 1003  
     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
 1004  11
         if (trimmer != null) {
 1005  10
             this.trimmerMatcher = trimmer;
 1006  
         }
 1007  11
         return this;
 1008  
     }
 1009  
 
 1010  
     //-----------------------------------------------------------------------
 1011  
     /**
 1012  
      * Gets whether the tokenizer currently returns empty tokens as null.
 1013  
      * The default for this property is false.
 1014  
      *
 1015  
      * @return true if empty tokens are returned as null
 1016  
      */
 1017  
     public boolean isEmptyTokenAsNull() {
 1018  23
         return this.emptyAsNull;
 1019  
     }
 1020  
 
 1021  
     /**
 1022  
      * Sets whether the tokenizer should return empty tokens as null.
 1023  
      * The default for this property is false.
 1024  
      *
 1025  
      * @param emptyAsNull  whether empty tokens are returned as null
 1026  
      * @return this, to enable chaining
 1027  
      */
 1028  
     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
 1029  16
         this.emptyAsNull = emptyAsNull;
 1030  16
         return this;
 1031  
     }
 1032  
 
 1033  
     //-----------------------------------------------------------------------
 1034  
     /**
 1035  
      * Gets whether the tokenizer currently ignores empty tokens.
 1036  
      * The default for this property is true.
 1037  
      *
 1038  
      * @return true if empty tokens are not returned
 1039  
      */
 1040  
     public boolean isIgnoreEmptyTokens() {
 1041  35
         return ignoreEmptyTokens;
 1042  
     }
 1043  
 
 1044  
     /**
 1045  
      * Sets whether the tokenizer should ignore and not return empty tokens.
 1046  
      * The default for this property is true.
 1047  
      *
 1048  
      * @param ignoreEmptyTokens  whether empty tokens are not returned
 1049  
      * @return this, to enable chaining
 1050  
      */
 1051  
     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
 1052  24
         this.ignoreEmptyTokens = ignoreEmptyTokens;
 1053  24
         return this;
 1054  
     }
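
    // Illustrative usage sketch (not part of the original source): how the two
    // empty-token flags above interact. By default empty tokens are skipped;
    // disabling that and enabling setEmptyTokenAsNull returns them as null.
    //
    //     StrTokenizer tok = new StrTokenizer("a,,b", ',')
    //             .setIgnoreEmptyTokens(false)
    //             .setEmptyTokenAsNull(true);
    //     // tokens: ["a", null, "b"]
    //
    //     new StrTokenizer("a,,b", ',').getTokenList();   // ["a", "b"]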
 1055  
 
 1056  
     //-----------------------------------------------------------------------
 1057  
     /**
 1058  
      * Gets the String content that the tokenizer is parsing.
 1059  
      *
 1060  
      * @return the string content being parsed
 1061  
      */
 1062  
     public String getContent() {
 1063  4
         if (chars == null) {
 1064  2
             return null;
 1065  
         }
 1066  2
         return new String(chars);
 1067  
     }
 1068  
 
 1069  
     //-----------------------------------------------------------------------
 1070  
     /**
 1071  
      * Creates a new instance of this Tokenizer. The new instance is reset so
 1072  
      * that it will be at the start of the token list.
 1073  
      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
 1074  
      *
 1075  
      * @return a new instance of this Tokenizer which has been reset.
 1076  
      */
 1077  
     @Override
 1078  
     public Object clone() {
 1079  
         try {
 1080  45
             return cloneReset();
 1081  1
         } catch (final CloneNotSupportedException ex) {
 1082  1
             return null;
 1083  
         }
 1084  
     }
 1085  
 
 1086  
     /**
 1087  
      * Creates a new instance of this Tokenizer. The new instance is reset so that
 1088  
      * it will be at the start of the token list.
 1089  
      *
 1090  
      * @return a new instance of this Tokenizer which has been reset.
 1091  
      * @throws CloneNotSupportedException if there is a problem cloning
 1092  
      */
 1093  
     Object cloneReset() throws CloneNotSupportedException {
 1094  
         // this method exists to enable 100% test coverage
 1095  44
         final StrTokenizer cloned = (StrTokenizer) super.clone();
 1096  44
         if (cloned.chars != null) {
 1097  1
             cloned.chars = cloned.chars.clone();
 1098  
         }
 1099  44
         cloned.reset();
 1100  44
         return cloned;
 1101  
     }
 1102  
 
 1103  
     //-----------------------------------------------------------------------
 1104  
     /**
 1105  
     * Gets a String representation of this tokenizer, including the parsed token list.
 1106  
      *
 1107  
     * @return the string representation of this tokenizer
 1108  
      */
 1109  
     @Override
 1110  
     public String toString() {
 1111  2
         if (tokens == null) {
 1112  1
             return "StrTokenizer[not tokenized yet]";
 1113  
         }
 1114  1
         return "StrTokenizer" + getTokenList();
 1115  
     }
 1116  
 
 1117  
 }