Coverage Report - org.apache.commons.lang3.text.StrTokenizer
 
Classes in this File Line Coverage Branch Coverage Complexity
StrTokenizer
91%
226/246
97%
90/92
1,984
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.lang3.text;
 18  
 
 19  
 import java.util.ArrayList;
 20  
 import java.util.Collections;
 21  
 import java.util.List;
 22  
 import java.util.ListIterator;
 23  
 import java.util.NoSuchElementException;
 24  
 
 25  
 import org.apache.commons.lang3.ArrayUtils;
 26  
 import org.apache.commons.lang3.StringUtils;
 27  
 
 28  
 /**
 29  
  * Tokenizes a string based on delimiters (separators)
 30  
  * and supporting quoting and ignored character concepts.
 31  
  * <p>
 32  
  * This class can split a String into many smaller strings. It aims
 33  
  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 34  
  * however it offers much more control and flexibility including implementing
 35  
  * the <code>ListIterator</code> interface. By default, it is set up
 36  
  * like <code>StringTokenizer</code>.
 37  
  * <p>
 38  
  * The input String is split into a number of <i>tokens</i>.
 39  
  * Each token is separated from the next String by a <i>delimiter</i>.
 40  
  * One or more delimiter characters must be specified.
 41  
  * <p>
 42  
  * Each token may be surrounded by quotes.
 43  
  * The <i>quote</i> matcher specifies the quote character(s).
 44  
  * A quote may be escaped within a quoted section by duplicating itself.
 45  
  * <p>
 46  
  * Between each token and the delimiter are potentially characters that need trimming.
 47  
  * The <i>trimmer</i> matcher specifies these characters.
 48  
  * One usage might be to trim whitespace characters.
 49  
  * <p>
 50  
  * At any point outside the quotes there might potentially be invalid characters.
 51  
  * The <i>ignored</i> matcher specifies these characters to be removed.
 52  
  * One usage might be to remove new line characters.
 53  
  * <p>
 54  
  * Empty tokens may be removed or returned as null.
 55  
  * <pre>
 56  
  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 57  
  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 58  
  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 59  
  * </pre>
 60  
  * <p>
 61  
  *
 62  
  * This tokenizer has the following properties and options:
 63  
  *
 64  
  * <table summary="Tokenizer Properties">
 65  
  *  <tr>
 66  
  *   <th>Property</th><th>Type</th><th>Default</th>
 67  
  *  </tr>
 68  
  *  <tr>
 69  
  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 70  
  *  </tr>
 71  
  *  <tr>
 72  
  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 73  
  *  </tr>
 74  
  *  <tr>
 75  
  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 76  
  *  </tr>
 77  
  *  <tr>
 78  
  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 79  
  *  </tr>
 80  
  *  <tr>
 81  
  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 82  
  *  </tr>
 83  
  * </table>
 84  
  *
 85  
  * @since 2.2
 86  
  * @version $Id: StrTokenizer.java 1583482 2014-03-31 22:54:57Z niallp $
 87  
  */
 88  0
 public class StrTokenizer implements ListIterator<String>, Cloneable {
 89  
 
 90  
     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
 91  
     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
 92  
     static {
 93  1
         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 94  1
         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
 95  1
         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 96  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 97  1
         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 98  1
         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 99  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 100  
 
 101  1
         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 102  1
         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
 103  1
         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 104  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 105  1
         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 106  1
         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 107  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 108  1
     }
 109  
 
 110  
     /** The text to work on. */
 111  
     private char chars[];
 112  
     /** The parsed tokens */
 113  
     private String tokens[];
 114  
     /** The current iteration position */
 115  
     private int tokenPos;
 116  
 
 117  
     /** The delimiter matcher */
 118  67
     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
 119  
     /** The quote matcher */
 120  67
     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
 121  
     /** The ignored matcher */
 122  67
     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
 123  
     /** The trimmer matcher */
 124  67
     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
 125  
 
 126  
     /** Whether to return empty tokens as null */
 127  67
     private boolean emptyAsNull = false;
 128  
     /** Whether to ignore empty tokens */
 129  67
     private boolean ignoreEmptyTokens = true;
 130  
 
 131  
     //-----------------------------------------------------------------------
 132  
 
 133  
     /**
 134  
      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 135  
      * 
 136  
      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 137  
      */
 138  
     private static StrTokenizer getCSVClone() {
 139  26
         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
 140  
     }
 141  
 
 142  
     /**
 143  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 144  
      * initializing it with the given input.  The default for CSV processing
 145  
      * will be trim whitespace from both ends (which can be overridden with
 146  
      * the setTrimmer method).
 147  
      * <p>
 148  
      * You must call a "reset" method to set the string which you want to parse.
 149  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 150  
      */
 151  
     public static StrTokenizer getCSVInstance() {
 152  16
         return getCSVClone();
 153  
     }
 154  
 
 155  
     /**
 156  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 157  
      * initializing it with the given input.  The default for CSV processing
 158  
      * will be trim whitespace from both ends (which can be overridden with
 159  
      * the setTrimmer method).
 160  
      *
 161  
      * @param input  the text to parse
 162  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 163  
      */
 164  
     public static StrTokenizer getCSVInstance(final String input) {
 165  6
         final StrTokenizer tok = getCSVClone();
 166  6
         tok.reset(input);
 167  6
         return tok;
 168  
     }
 169  
 
 170  
     /**
 171  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 172  
      * initializing it with the given input.  The default for CSV processing
 173  
      * will be trim whitespace from both ends (which can be overridden with
 174  
      * the setTrimmer method).
 175  
      *
 176  
      * @param input  the text to parse
 177  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 178  
      */
 179  
     public static StrTokenizer getCSVInstance(final char[] input) {
 180  4
         final StrTokenizer tok = getCSVClone();
 181  4
         tok.reset(input);
 182  4
         return tok;
 183  
     }
 184  
 
 185  
     /**
 186  
      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 187  
      * 
 188  
      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 189  
      */
 190  
     private static StrTokenizer getTSVClone() {
 191  16
         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
 192  
     }
 193  
 
 194  
 
 195  
     /**
 196  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 197  
      * The default for CSV processing will be trim whitespace from both ends
 198  
      * (which can be overridden with the setTrimmer method).
 199  
      * <p>
 200  
      * You must call a "reset" method to set the string which you want to parse.
 201  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 202  
      */
 203  
     public static StrTokenizer getTSVInstance() {
 204  14
         return getTSVClone();
 205  
     }
 206  
 
 207  
     /**
 208  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 209  
      * The default for CSV processing will be trim whitespace from both ends
 210  
      * (which can be overridden with the setTrimmer method).
 211  
      * @param input  the string to parse
 212  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 213  
      */
 214  
     public static StrTokenizer getTSVInstance(final String input) {
 215  1
         final StrTokenizer tok = getTSVClone();
 216  1
         tok.reset(input);
 217  1
         return tok;
 218  
     }
 219  
 
 220  
     /**
 221  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 222  
      * The default for CSV processing will be trim whitespace from both ends
 223  
      * (which can be overridden with the setTrimmer method).
 224  
      * @param input  the string to parse
 225  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 226  
      */
 227  
     public static StrTokenizer getTSVInstance(final char[] input) {
 228  1
         final StrTokenizer tok = getTSVClone();
 229  1
         tok.reset(input);
 230  1
         return tok;
 231  
     }
 232  
 
 233  
     //-----------------------------------------------------------------------
 234  
     /**
 235  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 236  
      * as per StringTokenizer, but with no text to tokenize.
 237  
      * <p>
 238  
      * This constructor is normally used with {@link #reset(String)}.
 239  
      */
 240  
     public StrTokenizer() {
 241  6
         super();
 242  6
         this.chars = null;
 243  6
     }
 244  
 
 245  
     /**
 246  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 247  
      * as per StringTokenizer.
 248  
      *
 249  
      * @param input  the string which is to be parsed
 250  
      */
 251  
     public StrTokenizer(final String input) {
 252  49
         super();
 253  49
         if (input != null) {
 254  46
             chars = input.toCharArray();
 255  
         } else {
 256  3
             chars = null;
 257  
         }
 258  49
     }
 259  
 
 260  
     /**
 261  
      * Constructs a tokenizer splitting on the specified delimiter character.
 262  
      *
 263  
      * @param input  the string which is to be parsed
 264  
      * @param delim  the field delimiter character
 265  
      */
 266  
     public StrTokenizer(final String input, final char delim) {
 267  23
         this(input);
 268  23
         setDelimiterChar(delim);
 269  23
     }
 270  
 
 271  
     /**
 272  
      * Constructs a tokenizer splitting on the specified delimiter string.
 273  
      *
 274  
      * @param input  the string which is to be parsed
 275  
      * @param delim  the field delimiter string
 276  
      */
 277  
     public StrTokenizer(final String input, final String delim) {
 278  0
         this(input);
 279  0
         setDelimiterString(delim);
 280  0
     }
 281  
 
 282  
     /**
 283  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 284  
      *
 285  
      * @param input  the string which is to be parsed
 286  
      * @param delim  the field delimiter matcher
 287  
      */
 288  
     public StrTokenizer(final String input, final StrMatcher delim) {
 289  0
         this(input);
 290  0
         setDelimiterMatcher(delim);
 291  0
     }
 292  
 
 293  
     /**
 294  
      * Constructs a tokenizer splitting on the specified delimiter character
 295  
      * and handling quotes using the specified quote character.
 296  
      *
 297  
      * @param input  the string which is to be parsed
 298  
      * @param delim  the field delimiter character
 299  
      * @param quote  the field quoted string character
 300  
      */
 301  
     public StrTokenizer(final String input, final char delim, final char quote) {
 302  11
         this(input, delim);
 303  11
         setQuoteChar(quote);
 304  11
     }
 305  
 
 306  
     /**
 307  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 308  
      * and handling quotes using the specified quote matcher.
 309  
      *
 310  
      * @param input  the string which is to be parsed
 311  
      * @param delim  the field delimiter matcher
 312  
      * @param quote  the field quoted string matcher
 313  
      */
 314  
     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
 315  0
         this(input, delim);
 316  0
         setQuoteMatcher(quote);
 317  0
     }
 318  
 
 319  
     /**
 320  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 321  
      * as per StringTokenizer.
 322  
      *
 323  
      * @param input  the string which is to be parsed, not cloned
 324  
      */
 325  
     public StrTokenizer(final char[] input) {
 326  12
         super();
 327  12
         this.chars = ArrayUtils.clone(input);
 328  12
     }
 329  
 
 330  
     /**
 331  
      * Constructs a tokenizer splitting on the specified character.
 332  
      *
 333  
      * @param input  the string which is to be parsed, not cloned
 334  
      * @param delim the field delimiter character
 335  
      */
 336  
     public StrTokenizer(final char[] input, final char delim) {
 337  6
         this(input);
 338  6
         setDelimiterChar(delim);
 339  6
     }
 340  
 
 341  
     /**
 342  
      * Constructs a tokenizer splitting on the specified string.
 343  
      *
 344  
      * @param input  the string which is to be parsed, not cloned
 345  
      * @param delim the field delimiter string
 346  
      */
 347  
     public StrTokenizer(final char[] input, final String delim) {
 348  0
         this(input);
 349  0
         setDelimiterString(delim);
 350  0
     }
 351  
 
 352  
     /**
 353  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 354  
      *
 355  
      * @param input  the string which is to be parsed, not cloned
 356  
      * @param delim  the field delimiter matcher
 357  
      */
 358  
     public StrTokenizer(final char[] input, final StrMatcher delim) {
 359  0
         this(input);
 360  0
         setDelimiterMatcher(delim);
 361  0
     }
 362  
 
 363  
     /**
 364  
      * Constructs a tokenizer splitting on the specified delimiter character
 365  
      * and handling quotes using the specified quote character.
 366  
      *
 367  
      * @param input  the string which is to be parsed, not cloned
 368  
      * @param delim  the field delimiter character
 369  
      * @param quote  the field quoted string character
 370  
      */
 371  
     public StrTokenizer(final char[] input, final char delim, final char quote) {
 372  3
         this(input, delim);
 373  3
         setQuoteChar(quote);
 374  3
     }
 375  
 
 376  
     /**
 377  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 378  
      * and handling quotes using the specified quote matcher.
 379  
      *
 380  
      * @param input  the string which is to be parsed, not cloned
 381  
      * @param delim  the field delimiter character
 382  
      * @param quote  the field quoted string character
 383  
      */
 384  
     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
 385  0
         this(input, delim);
 386  0
         setQuoteMatcher(quote);
 387  0
     }
 388  
 
 389  
     // API
 390  
     //-----------------------------------------------------------------------
 391  
     /**
 392  
      * Gets the number of tokens found in the String.
 393  
      *
 394  
      * @return the number of matched tokens
 395  
      */
 396  
     public int size() {
 397  17
         checkTokenized();
 398  17
         return tokens.length;
 399  
     }
 400  
 
 401  
     /**
 402  
      * Gets the next token from the String.
 403  
      * Equivalent to {@link #next()} except it returns null rather than
 404  
      * throwing {@link NoSuchElementException} when no tokens remain.
 405  
      *
 406  
      * @return the next sequential token, or null when no more tokens are found
 407  
      */
 408  
     public String nextToken() {
 409  52
         if (hasNext()) {
 410  34
             return tokens[tokenPos++];
 411  
         }
 412  18
         return null;
 413  
     }
 414  
 
 415  
     /**
 416  
      * Gets the previous token from the String.
 417  
      *
 418  
      * @return the previous sequential token, or null when no more tokens are found
 419  
      */
 420  
     public String previousToken() {
 421  50
         if (hasPrevious()) {
 422  30
             return tokens[--tokenPos];
 423  
         }
 424  20
         return null;
 425  
     }
 426  
 
 427  
     /**
 428  
      * Gets a copy of the full token list as an independent modifiable array.
 429  
      *
 430  
      * @return the tokens as a String array
 431  
      */
 432  
     public String[] getTokenArray() {
 433  12
         checkTokenized();
 434  12
         return tokens.clone();
 435  
     }
 436  
 
 437  
     /**
 438  
      * Gets a copy of the full token list as an independent modifiable list.
 439  
      *
 440  
      * @return the tokens as a String array
 441  
      */
 442  
     public List<String> getTokenList() {
 443  2
         checkTokenized();
 444  2
         final List<String> list = new ArrayList<String>(tokens.length);
 445  10
         for (final String element : tokens) {
 446  8
             list.add(element);
 447  
         }
 448  2
         return list;
 449  
     }
 450  
 
 451  
     /**
 452  
      * Resets this tokenizer, forgetting all parsing and iteration already completed.
 453  
      * <p>
 454  
      * This method allows the same tokenizer to be reused for the same String.
 455  
      *
 456  
      * @return this, to enable chaining
 457  
      */
 458  
     public StrTokenizer reset() {
 459  69
         tokenPos = 0;
 460  69
         tokens = null;
 461  69
         return this;
 462  
     }
 463  
 
 464  
     /**
 465  
      * Reset this tokenizer, giving it a new input string to parse.
 466  
      * In this manner you can re-use a tokenizer with the same settings
 467  
      * on multiple input lines.
 468  
      *
 469  
      * @param input  the new string to tokenize, null sets no text to parse
 470  
      * @return this, to enable chaining
 471  
      */
 472  
     public StrTokenizer reset(final String input) {
 473  10
         reset();
 474  10
         if (input != null) {
 475  9
             this.chars = input.toCharArray();
 476  
         } else {
 477  1
             this.chars = null;
 478  
         }
 479  10
         return this;
 480  
     }
 481  
 
 482  
     /**
 483  
      * Reset this tokenizer, giving it a new input string to parse.
 484  
      * In this manner you can re-use a tokenizer with the same settings
 485  
      * on multiple input lines.
 486  
      *
 487  
      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
 488  
      * @return this, to enable chaining
 489  
      */
 490  
     public StrTokenizer reset(final char[] input) {
 491  10
         reset();
 492  10
         this.chars = ArrayUtils.clone(input);
 493  10
         return this;
 494  
     }
 495  
 
 496  
     // ListIterator
 497  
     //-----------------------------------------------------------------------
 498  
     /**
 499  
      * Checks whether there are any more tokens.
 500  
      *
 501  
      * @return true if there are more tokens
 502  
      */
 503  
     @Override
 504  
     public boolean hasNext() {
 505  233
         checkTokenized();
 506  233
         return tokenPos < tokens.length;
 507  
     }
 508  
 
 509  
     /**
 510  
      * Gets the next token.
 511  
      *
 512  
      * @return the next String token
 513  
      * @throws NoSuchElementException if there are no more elements
 514  
      */
 515  
     @Override
 516  
     public String next() {
 517  116
         if (hasNext()) {
 518  111
             return tokens[tokenPos++];
 519  
         }
 520  5
         throw new NoSuchElementException();
 521  
     }
 522  
 
 523  
     /**
 524  
      * Gets the index of the next token to return.
 525  
      *
 526  
      * @return the next token index
 527  
      */
 528  
     @Override
 529  
     public int nextIndex() {
 530  90
         return tokenPos;
 531  
     }
 532  
 
 533  
     /**
 534  
      * Checks whether there are any previous tokens that can be iterated to.
 535  
      *
 536  
      * @return true if there are previous tokens
 537  
      */
 538  
     @Override
 539  
     public boolean hasPrevious() {
 540  77
         checkTokenized();
 541  77
         return tokenPos > 0;
 542  
     }
 543  
 
 544  
     /**
 545  
      * Gets the token previous to the last returned token.
 546  
      *
 547  
      * @return the previous token
 548  
      */
 549  
     @Override
 550  
     public String previous() {
 551  9
         if (hasPrevious()) {
 552  8
             return tokens[--tokenPos];
 553  
         }
 554  1
         throw new NoSuchElementException();
 555  
     }
 556  
 
 557  
     /**
 558  
      * Gets the index of the previous token.
 559  
      *
 560  
      * @return the previous token index
 561  
      */
 562  
     @Override
 563  
     public int previousIndex() {
 564  20
         return tokenPos - 1;
 565  
     }
 566  
 
 567  
     /**
 568  
      * Unsupported ListIterator operation.
 569  
      *
 570  
      * @throws UnsupportedOperationException always
 571  
      */
 572  
     @Override
 573  
     public void remove() {
 574  1
         throw new UnsupportedOperationException("remove() is unsupported");
 575  
     }
 576  
 
 577  
     /**
 578  
      * Unsupported ListIterator operation.
 579  
      * @param obj this parameter ignored.
 580  
      * @throws UnsupportedOperationException always
 581  
      */
 582  
     @Override
 583  
     public void set(final String obj) {
 584  1
         throw new UnsupportedOperationException("set() is unsupported");
 585  
     }
 586  
 
 587  
     /**
 588  
      * Unsupported ListIterator operation.
 589  
      * @param obj this parameter ignored.
 590  
      * @throws UnsupportedOperationException always
 591  
      */
 592  
     @Override
 593  
     public void add(final String obj) {
 594  1
         throw new UnsupportedOperationException("add() is unsupported");
 595  
     }
 596  
 
 597  
     // Implementation
 598  
     //-----------------------------------------------------------------------
 599  
     /**
 600  
      * Checks if tokenization has been done, and if not then do it.
 601  
      */
 602  
     private void checkTokenized() {
 603  341
         if (tokens == null) {
 604  84
             if (chars == null) {
 605  
                 // still call tokenize as subclass may do some work
 606  16
                 final List<String> split = tokenize(null, 0, 0);
 607  16
                 tokens = split.toArray(new String[split.size()]);
 608  16
             } else {
 609  68
                 final List<String> split = tokenize(chars, 0, chars.length);
 610  68
                 tokens = split.toArray(new String[split.size()]);
 611  
             }
 612  
         }
 613  341
     }
 614  
 
 615  
     /**
 616  
      * Internal method to performs the tokenization.
 617  
      * <p>
 618  
      * Most users of this class do not need to call this method. This method
 619  
      * will be called automatically by other (public) methods when required.
 620  
      * <p>
 621  
      * This method exists to allow subclasses to add code before or after the
 622  
      * tokenization. For example, a subclass could alter the character array,
 623  
      * offset or count to be parsed, or call the tokenizer multiple times on
 624  
      * multiple strings. It is also be possible to filter the results.
 625  
      * <p>
 626  
      * <code>StrTokenizer</code> will always pass a zero offset and a count
 627  
      * equal to the length of the array to this method, however a subclass
 628  
      * may pass other values, or even an entirely different array.
 629  
      * 
 630  
      * @param srcChars  the character array being tokenized, may be null
 631  
      * @param offset  the start position within the character array, must be valid
 632  
      * @param count  the number of characters to tokenize, must be valid
 633  
      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
 634  
      */
 635  
     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
 636  84
         if (srcChars == null || count == 0) {
 637  22
             return Collections.emptyList();
 638  
         }
 639  62
         final StrBuilder buf = new StrBuilder();
 640  62
         final List<String> tokenList = new ArrayList<String>();
 641  62
         int pos = offset;
 642  
         
 643  
         // loop around the entire buffer
 644  264
         while (pos >= 0 && pos < count) {
 645  
             // find next token
 646  202
             pos = readNextToken(srcChars, pos, count, buf, tokenList);
 647  
             
 648  
             // handle case where end of string is a delimiter
 649  202
             if (pos >= count) {
 650  12
                 addToken(tokenList, "");
 651  
             }
 652  
         }
 653  62
         return tokenList;
 654  
     }
 655  
 
 656  
     /**
 657  
      * Adds a token to a list, paying attention to the parameters we've set.
 658  
      *
 659  
      * @param list  the list to add to
 660  
      * @param tok  the token to add
 661  
      */
 662  
     private void addToken(final List<String> list, String tok) {
 663  214
         if (StringUtils.isEmpty(tok)) {
 664  35
             if (isIgnoreEmptyTokens()) {
 665  12
                 return;
 666  
             }
 667  23
             if (isEmptyTokenAsNull()) {
 668  11
                 tok = null;
 669  
             }
 670  
         }
 671  202
         list.add(tok);
 672  202
     }
 673  
 
 674  
     /**
 675  
      * Reads character by character through the String to get the next token.
 676  
      *
 677  
      * @param srcChars  the character array being tokenized
 678  
      * @param start  the first character of field
 679  
      * @param len  the length of the character array being tokenized
 680  
      * @param workArea  a temporary work area
 681  
      * @param tokenList  the list of parsed tokens
 682  
      * @return the starting position of the next field (the character
 683  
      *  immediately after the delimiter), or -1 if end of string found
 684  
      */
 685  
     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
 686  
         // skip all leading whitespace, unless it is the
 687  
         // field delimiter or the quote character
 688  271
         while (start < len) {
 689  266
             final int removeLen = Math.max(
 690  
                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
 691  
                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
 692  266
             if (removeLen == 0 ||
 693  
                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
 694  
                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
 695  0
                 break;
 696  
             }
 697  69
             start += removeLen;
 698  69
         }
 699  
         
 700  
         // handle reaching end
 701  202
         if (start >= len) {
 702  5
             addToken(tokenList, "");
 703  5
             return -1;
 704  
         }
 705  
         
 706  
         // handle empty token
 707  197
         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
 708  197
         if (delimLen > 0) {
 709  18
             addToken(tokenList, "");
 710  18
             return start + delimLen;
 711  
         }
 712  
         
 713  
         // handle found token
 714  179
         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
 715  179
         if (quoteLen > 0) {
 716  17
             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
 717  
         }
 718  162
         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
 719  
     }
 720  
 
 721  
     /**
 722  
      * Reads a possibly quoted string token and hands it to <code>addToken</code>.
 723  
      * <p>
      * The work area is cleared and then filled one character at a time while the
      * method switches between quoting and non-quoting mode. Inside quotes every
      * character is copied, and a doubled quote is copied as a single quote.
      * Outside quotes a delimiter ends the token, ignored characters are skipped,
      * and trimmer characters are copied but only kept if a regular or quoted
      * character follows them.
      *
 724  
      * @param srcChars  the character array being tokenized
 725  
      * @param start  the first character of field (after the opening quote, if any)
 726  
      * @param len  the length of the character array being tokenized
 727  
      * @param workArea  a temporary work area, cleared on entry
 728  
      * @param tokenList  the list of parsed tokens
 729  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 730  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 731  
      * @return the starting position of the next field (the character
 732  
      *  immediately after the delimiter), or -1 if the end of the
 733  
      *  input was reached without finding an unquoted delimiter
 734  
      */
 735  
     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 
 736  
                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
 737  
         // Loop until we've found the end of the quoted
 738  
         // string or the end of the input
 739  179
         workArea.clear();
 740  179
         int pos = start;
 741  179
         boolean quoting = quoteLen > 0;
 742  179
         // length of workArea up to the last character that must be kept;
         // anything appended after it is cut off when the token is emitted
         int trimStart = 0;
 743  
         
 744  490
         while (pos < len) {
 745  
             // quoting mode can occur several times throughout a string
 746  
             // we must switch between quoting and non-quoting until we
 747  
             // encounter a non-quoted delimiter, or end of string
 748  445
             if (quoting) {
 749  
                 // In quoting mode
 750  
                 
 751  
                 // If we've found a quote character, see if it's
 752  
                 // followed by a second quote.  If so, then we need
 753  
                 // to actually put the quote character into the token
 754  
                 // rather than end the token.
 755  93
                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 756  27
                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
 757  
                         // matched pair of quotes, thus an escaped quote:
                         // copy one quote and skip past both
 758  7
                         workArea.append(srcChars, pos, quoteLen);
 759  7
                         pos += quoteLen * 2;
 760  7
                         trimStart = workArea.size();
 761  7
                         continue;
 762  
                     }
 763  
                     
 764  
                     // end of quoting
 765  20
                     quoting = false;
 766  20
                     pos += quoteLen;
 767  20
                     continue;
 768  
                 }
 769  
                 
 770  
                 // copy regular character from inside quotes
 771  66
                 workArea.append(srcChars[pos++]);
 772  66
                 trimStart = workArea.size();
 773  
                 
 774  
             } else {
 775  
                 // Not in quoting mode
 776  
                 
 777  
                 // check for delimiter, and thus end of token
 778  352
                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
 779  352
                 if (delimLen > 0) {
 780  
                     // return condition when end of token found
 781  134
                     addToken(tokenList, workArea.substring(0, trimStart));
 782  134
                     return pos + delimLen;
 783  
                 }
 784  
                 
 785  
                 // check for quote, and thus back into quoting mode
 786  218
                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 787  4
                     quoting = true;
 788  4
                     pos += quoteLen;
 789  4
                     continue;
 790  
                 }
 791  
                 
 792  
                 // check for ignored (outside quotes), and ignore
 793  214
                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
 794  214
                 if (ignoredLen > 0) {
 795  12
                     pos += ignoredLen;
 796  12
                     continue;
 797  
                 }
 798  
                 
 799  
                 // check for trimmed character
 800  
                 // don't yet know if its at the end, so copy to workArea
 801  
                 // use trimStart to keep track of trim at the end
                 // (trimStart is deliberately left unchanged here)
 802  202
                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
 803  202
                 if (trimmedLen > 0) {
 804  17
                     workArea.append(srcChars, pos, trimmedLen);
 805  17
                     pos += trimmedLen;
 806  17
                     continue;
 807  
                 }
 808  
                 
 809  
                 // copy regular character from outside quotes
 810  185
                 workArea.append(srcChars[pos++]);
 811  185
                 trimStart = workArea.size();
 812  185
             }
 813  
         }
 814  
         
 815  
         // return condition when end of string found:
         // the token is emitted and -1 (not the string length) is returned
 816  45
         addToken(tokenList, workArea.substring(0, trimStart));
 817  45
         return -1;
 818  
     }
 819  
 
 820  
     /**
 821  
      * Checks if the characters at the index specified match the quote
 822  
      * already matched in readNextToken().
 823  
      *
 824  
      * @param srcChars  the character array being tokenized
 825  
      * @param pos  the position to check for a quote
 826  
      * @param len  the length of the character array being tokenized
 827  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 828  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 829  
      * @return true if a quote is matched
 830  
      */
 831  
     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
 832  173
         for (int i = 0; i < quoteLen; i++) {
 833  135
             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
 834  97
                 return false;
 835  
             }
 836  
         }
 837  38
         return true;
 838  
     }
 839  
 
 840  
     // Delimiter
 841  
     //-----------------------------------------------------------------------
 842  
     /**
 843  
      * Gets the field delimiter matcher.
 844  
      *
 845  
      * @return the delimiter matcher in use
 846  
      */
 847  
     public StrMatcher getDelimiterMatcher() {
 848  622
         return this.delimMatcher;
 849  
     }
 850  
 
 851  
     /**
 852  
      * Sets the field delimiter matcher.
 853  
      * <p>
 854  
      * The delimitier is used to separate one token from another.
 855  
      *
 856  
      * @param delim  the delimiter matcher to use
 857  
      * @return this, to enable chaining
 858  
      */
 859  
     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
 860  42
         if (delim == null) {
 861  1
             this.delimMatcher = StrMatcher.noneMatcher();
 862  
         } else {
 863  41
             this.delimMatcher = delim;
 864  
         }
 865  42
         return this;
 866  
     }
 867  
 
 868  
     /**
 869  
      * Sets the field delimiter character.
 870  
      *
 871  
      * @param delim  the delimiter character to use
 872  
      * @return this, to enable chaining
 873  
      */
 874  
     public StrTokenizer setDelimiterChar(final char delim) {
 875  36
         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
 876  
     }
 877  
 
 878  
     /**
 879  
      * Sets the field delimiter string.
 880  
      *
 881  
      * @param delim  the delimiter string to use
 882  
      * @return this, to enable chaining
 883  
      */
 884  
     public StrTokenizer setDelimiterString(final String delim) {
 885  1
         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
 886  
     }
 887  
 
 888  
     // Quote
 889  
     //-----------------------------------------------------------------------
 890  
     /**
 891  
      * Gets the quote matcher currently in use.
 892  
      * <p>
 893  
      * The quote character is used to wrap data between the tokens.
 894  
      * This enables delimiters to be entered as data.
 895  
      * The default value is '"' (double quote).
 896  
      *
 897  
      * @return the quote matcher in use
 898  
      */
 899  
     public StrMatcher getQuoteMatcher() {
 900  250
         return quoteMatcher;
 901  
     }
 902  
 
 903  
     /**
 904  
      * Set the quote matcher to use.
 905  
      * <p>
 906  
      * The quote character is used to wrap data between the tokens.
 907  
      * This enables delimiters to be entered as data.
 908  
      *
 909  
      * @param quote  the quote matcher to use, null ignored
 910  
      * @return this, to enable chaining
 911  
      */
 912  
     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
 913  28
         if (quote != null) {
 914  27
             this.quoteMatcher = quote;
 915  
         }
 916  28
         return this;
 917  
     }
 918  
 
 919  
     /**
 920  
      * Sets the quote character to use.
 921  
      * <p>
 922  
      * The quote character is used to wrap data between the tokens.
 923  
      * This enables delimiters to be entered as data.
 924  
      *
 925  
      * @param quote  the quote character to use
 926  
      * @return this, to enable chaining
 927  
      */
 928  
     public StrTokenizer setQuoteChar(final char quote) {
 929  21
         return setQuoteMatcher(StrMatcher.charMatcher(quote));
 930  
     }
 931  
 
 932  
     // Ignored
 933  
     //-----------------------------------------------------------------------
 934  
     /**
 935  
      * Gets the ignored character matcher.
 936  
      * <p>
 937  
      * These characters are ignored when parsing the String, unless they are
 938  
      * within a quoted region.
 939  
      * The default value is not to ignore anything.
 940  
      *
 941  
      * @return the ignored matcher in use
 942  
      */
 943  
     public StrMatcher getIgnoredMatcher() {
 944  480
         return ignoredMatcher;
 945  
     }
 946  
 
 947  
     /**
 948  
      * Set the matcher for characters to ignore.
 949  
      * <p>
 950  
      * These characters are ignored when parsing the String, unless they are
 951  
      * within a quoted region.
 952  
      *
 953  
      * @param ignored  the ignored matcher to use, null ignored
 954  
      * @return this, to enable chaining
 955  
      */
 956  
     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
 957  16
         if (ignored != null) {
 958  15
             this.ignoredMatcher = ignored;
 959  
         }
 960  16
         return this;
 961  
     }
 962  
 
 963  
     /**
 964  
      * Set the character to ignore.
 965  
      * <p>
 966  
      * This character is ignored when parsing the String, unless it is
 967  
      * within a quoted region.
 968  
      *
 969  
      * @param ignored  the ignored character to use
 970  
      * @return this, to enable chaining
 971  
      */
 972  
     public StrTokenizer setIgnoredChar(final char ignored) {
 973  1
         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
 974  
     }
 975  
 
 976  
     // Trimmer
 977  
     //-----------------------------------------------------------------------
 978  
     /**
 979  
      * Gets the trimmer character matcher.
 980  
      * <p>
 981  
      * These characters are trimmed off on each side of the delimiter
 982  
      * until the token or quote is found.
 983  
      * The default value is not to trim anything.
 984  
      *
 985  
      * @return the trimmer matcher in use
 986  
      */
 987  
     public StrMatcher getTrimmerMatcher() {
 988  468
         return trimmerMatcher;
 989  
     }
 990  
 
 991  
     /**
 992  
      * Sets the matcher for characters to trim.
 993  
      * <p>
 994  
      * These characters are trimmed off on each side of the delimiter
 995  
      * until the token or quote is found.
 996  
      *
 997  
      * @param trimmer  the trimmer matcher to use, null ignored
 998  
      * @return this, to enable chaining
 999  
      */
 1000  
     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
 1001  11
         if (trimmer != null) {
 1002  10
             this.trimmerMatcher = trimmer;
 1003  
         }
 1004  11
         return this;
 1005  
     }
 1006  
 
 1007  
     //-----------------------------------------------------------------------
 1008  
     /**
 1009  
      * Gets whether the tokenizer currently returns empty tokens as null.
 1010  
      * The default for this property is false.
 1011  
      *
 1012  
      * @return true if empty tokens are returned as null
 1013  
      */
 1014  
     public boolean isEmptyTokenAsNull() {
 1015  23
         return this.emptyAsNull;
 1016  
     }
 1017  
 
 1018  
     /**
 1019  
      * Sets whether the tokenizer should return empty tokens as null.
 1020  
      * The default for this property is false.
 1021  
      *
 1022  
      * @param emptyAsNull  whether empty tokens are returned as null
 1023  
      * @return this, to enable chaining
 1024  
      */
 1025  
     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
 1026  16
         this.emptyAsNull = emptyAsNull;
 1027  16
         return this;
 1028  
     }
 1029  
 
 1030  
     //-----------------------------------------------------------------------
 1031  
     /**
 1032  
      * Gets whether the tokenizer currently ignores empty tokens.
 1033  
      * The default for this property is true.
 1034  
      *
 1035  
      * @return true if empty tokens are not returned
 1036  
      */
 1037  
     public boolean isIgnoreEmptyTokens() {
 1038  35
         return ignoreEmptyTokens;
 1039  
     }
 1040  
 
 1041  
     /**
 1042  
      * Sets whether the tokenizer should ignore and not return empty tokens.
 1043  
      * The default for this property is true.
 1044  
      *
 1045  
      * @param ignoreEmptyTokens  whether empty tokens are not returned
 1046  
      * @return this, to enable chaining
 1047  
      */
 1048  
     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
 1049  24
         this.ignoreEmptyTokens = ignoreEmptyTokens;
 1050  24
         return this;
 1051  
     }
 1052  
 
 1053  
     //-----------------------------------------------------------------------
 1054  
     /**
 1055  
      * Gets the String content that the tokenizer is parsing.
 1056  
      *
 1057  
      * @return the string content being parsed
 1058  
      */
 1059  
     public String getContent() {
 1060  4
         if (chars == null) {
 1061  2
             return null;
 1062  
         }
 1063  2
         return new String(chars);
 1064  
     }
 1065  
 
 1066  
     //-----------------------------------------------------------------------
 1067  
     /**
 1068  
      * Creates a new instance of this Tokenizer. The new instance is reset so
 1069  
      * that it will be at the start of the token list.
 1070  
      * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
 1071  
      * 
 1072  
      * @return a new instance of this Tokenizer which has been reset.
 1073  
      */
 1074  
     @Override
 1075  
     public Object clone() {
 1076  
         try {
 1077  45
             return cloneReset();
 1078  1
         } catch (final CloneNotSupportedException ex) {
 1079  1
             return null;
 1080  
         }
 1081  
     }
 1082  
 
 1083  
     /**
 1084  
      * Creates a new instance of this Tokenizer. The new instance is reset so that
 1085  
      * it will be at the start of the token list.
 1086  
      * 
 1087  
      * @return a new instance of this Tokenizer which has been reset.
 1088  
      * @throws CloneNotSupportedException if there is a problem cloning
 1089  
      */
 1090  
     Object cloneReset() throws CloneNotSupportedException {
 1091  
         // this method exists to enable 100% test coverage
 1092  44
         final StrTokenizer cloned = (StrTokenizer) super.clone();
 1093  44
         if (cloned.chars != null) {
 1094  1
             cloned.chars = cloned.chars.clone();
 1095  
         }
 1096  44
         cloned.reset();
 1097  44
         return cloned;
 1098  
     }
 1099  
 
 1100  
     //-----------------------------------------------------------------------
 1101  
     /**
 1102  
      * Gets the String content that the tokenizer is parsing.
 1103  
      *
 1104  
      * @return the string content being parsed
 1105  
      */
 1106  
     @Override
 1107  
     public String toString() {
 1108  2
         if (tokens == null) {
 1109  1
             return "StrTokenizer[not tokenized yet]";
 1110  
         }
 1111  1
         return "StrTokenizer" + getTokenList();
 1112  
     }
 1113  
 
 1114  
 }