Coverage Report - org.apache.commons.lang3.text.StrTokenizer
 
Classes in this File    Line Coverage      Branch Coverage    Complexity
StrTokenizer            91% (226/246)      97% (90/92)        1,984
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.lang3.text;
 18  
 
 19  
 import java.util.ArrayList;
 20  
 import java.util.Collections;
 21  
 import java.util.List;
 22  
 import java.util.ListIterator;
 23  
 import java.util.NoSuchElementException;
 24  
 
 25  
 import org.apache.commons.lang3.ArrayUtils;
 26  
 import org.apache.commons.lang3.StringUtils;
 27  
 
 28  
 /**
 29  
 * Tokenizes a string based on delimiters (separators)
 30  
 * and supports quoting and ignored characters.
 31  
  * <p>
 32  
  * This class can split a String into many smaller strings. It aims
 33  
  * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 34  
  * however it offers much more control and flexibility including implementing
 35  
  * the <code>ListIterator</code> interface. By default, it is set up
 36  
  * like <code>StringTokenizer</code>.
 37  
  * <p>
 38  
  * The input String is split into a number of <i>tokens</i>.
 39  
  * Each token is separated from the next String by a <i>delimiter</i>.
 40  
  * One or more delimiter characters must be specified.
 41  
  * <p>
 42  
  * Each token may be surrounded by quotes.
 43  
  * The <i>quote</i> matcher specifies the quote character(s).
 44  
  * A quote may be escaped within a quoted section by duplicating itself.
 45  
  * <p>
 46  
 * Between each token and the delimiter there may be characters that need trimming.
 47  
  * The <i>trimmer</i> matcher specifies these characters.
 48  
  * One usage might be to trim whitespace characters.
 49  
  * <p>
 50  
 * At any point outside the quotes there may be invalid characters.
 51  
  * The <i>ignored</i> matcher specifies these characters to be removed.
 52  
  * One usage might be to remove new line characters.
 53  
  * <p>
 54  
  * Empty tokens may be removed or returned as null.
 55  
  * <pre>
 56  
  * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 57  
  * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 58  
  * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 59  
  * </pre>
 60  
  * <p>
 61  
  *
 62  
  * This tokenizer has the following properties and options:
 63  
  *
 64  
  * <table summary="Tokenizer Properties">
 65  
  *  <tr>
 66  
  *   <th>Property</th><th>Type</th><th>Default</th>
 67  
  *  </tr>
 68  
  *  <tr>
 69  
  *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 70  
  *  </tr>
 71  
  *  <tr>
 72  
  *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 73  
  *  </tr>
 74  
  *  <tr>
 75  
  *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 76  
  *  </tr>
 77  
  *  <tr>
 78  
  *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 79  
  *  </tr>
 80  
  *  <tr>
 81  
  *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 82  
  *  </tr>
 83  
  * </table>
 84  
  *
 85  
  * @since 2.2
 86  
  */
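A minimal usage sketch of the behaviour described in the Javadoc above. The input text, delimiter and quote character are illustrative, and imports of org.apache.commons.lang3.text.StrTokenizer and StrMatcher are assumed:

    StrTokenizer tok = new StrTokenizer("a, \"b,c\", d", ',', '"');
    tok.setTrimmerMatcher(StrMatcher.trimMatcher());
    while (tok.hasNext()) {
        System.out.println(tok.next());   // prints "a", then "b,c", then "d"
    }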
 87  0
 public class StrTokenizer implements ListIterator<String>, Cloneable {
 88  
 
 89  
     private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
 90  
     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
 91  
     static {
 92  1
         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 93  1
         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
 94  1
         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 95  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 96  1
         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 97  1
         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 98  1
         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 99  
 
 100  1
         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
 101  1
         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
 102  1
         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
 103  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
 104  1
         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
 105  1
         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
 106  1
         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
 107  1
     }
 108  
 
 109  
     /** The text to work on. */
 110  
     private char chars[];
 111  
     /** The parsed tokens */
 112  
     private String tokens[];
 113  
     /** The current iteration position */
 114  
     private int tokenPos;
 115  
 
 116  
     /** The delimiter matcher */
 117  67
     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
 118  
     /** The quote matcher */
 119  67
     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
 120  
     /** The ignored matcher */
 121  67
     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
 122  
     /** The trimmer matcher */
 123  67
     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
 124  
 
 125  
     /** Whether to return empty tokens as null */
 126  67
     private boolean emptyAsNull = false;
 127  
     /** Whether to ignore empty tokens */
 128  67
     private boolean ignoreEmptyTokens = true;
 129  
 
 130  
     //-----------------------------------------------------------------------
 131  
 
 132  
     /**
 133  
      * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 134  
      * 
 135  
      * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
 136  
      */
 137  
     private static StrTokenizer getCSVClone() {
 138  26
         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
 139  
     }
 140  
 
 141  
     /**
 142  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 143  
     * (no input is set initially).  The default for CSV processing
 144  
     * is to trim whitespace from both ends (which can be overridden with
 145  
     * the setTrimmerMatcher method).
 146  
      * <p>
 147  
      * You must call a "reset" method to set the string which you want to parse.
 148  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 149  
      */
 150  
     public static StrTokenizer getCSVInstance() {
 151  16
         return getCSVClone();
 152  
     }
 153  
 
 154  
     /**
 155  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 156  
      * initializing it with the given input.  The default for CSV processing
 157  
     * is to trim whitespace from both ends (which can be overridden with
 158  
     * the setTrimmerMatcher method).
 159  
      *
 160  
      * @param input  the text to parse
 161  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 162  
      */
 163  
     public static StrTokenizer getCSVInstance(final String input) {
 164  6
         final StrTokenizer tok = getCSVClone();
 165  6
         tok.reset(input);
 166  6
         return tok;
 167  
     }
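A sketch of the two CSV factory styles documented above; the sample inputs are made up and java.util.List is assumed to be imported:

    // No-argument factory: a reset(...) call supplies the text to parse.
    StrTokenizer csv = StrTokenizer.getCSVInstance();
    csv.reset(" a, b ,\"c,d\"");
    String[] fields = csv.getTokenArray();        // ["a", "b", "c,d"] - whitespace trimmed, quotes honoured

    // One-argument factory: parses the given input directly.
    // The CSV prototype keeps empty tokens, so parts is ["x", "", "y"].
    List<String> parts = StrTokenizer.getCSVInstance("x,,y").getTokenList();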
 168  
 
 169  
     /**
 170  
      * Gets a new tokenizer instance which parses Comma Separated Value strings
 171  
      * initializing it with the given input.  The default for CSV processing
 172  
     * is to trim whitespace from both ends (which can be overridden with
 173  
     * the setTrimmerMatcher method).
 174  
      *
 175  
      * @param input  the text to parse
 176  
      * @return a new tokenizer instance which parses Comma Separated Value strings
 177  
      */
 178  
     public static StrTokenizer getCSVInstance(final char[] input) {
 179  4
         final StrTokenizer tok = getCSVClone();
 180  4
         tok.reset(input);
 181  4
         return tok;
 182  
     }
 183  
 
 184  
     /**
 185  
      * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 186  
      * 
 187  
      * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
 188  
      */
 189  
     private static StrTokenizer getTSVClone() {
 190  16
         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
 191  
     }
 192  
 
 193  
 
 194  
     /**
 195  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 196  
     * The default for TSV processing is to trim whitespace from both ends
 197  
     * (which can be overridden with the setTrimmerMatcher method).
 198  
      * <p>
 199  
      * You must call a "reset" method to set the string which you want to parse.
 200  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 201  
      */
 202  
     public static StrTokenizer getTSVInstance() {
 203  14
         return getTSVClone();
 204  
     }
 205  
 
 206  
     /**
 207  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 208  
     * The default for TSV processing is to trim whitespace from both ends
 209  
     * (which can be overridden with the setTrimmerMatcher method).
 210  
      * @param input  the string to parse
 211  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 212  
      */
 213  
     public static StrTokenizer getTSVInstance(final String input) {
 214  1
         final StrTokenizer tok = getTSVClone();
 215  1
         tok.reset(input);
 216  1
         return tok;
 217  
     }
 218  
 
 219  
     /**
 220  
      * Gets a new tokenizer instance which parses Tab Separated Value strings.
 221  
     * The default for TSV processing is to trim whitespace from both ends
 222  
     * (which can be overridden with the setTrimmerMatcher method).
 223  
      * @param input  the string to parse
 224  
      * @return a new tokenizer instance which parses Tab Separated Value strings.
 225  
      */
 226  
     public static StrTokenizer getTSVInstance(final char[] input) {
 227  1
         final StrTokenizer tok = getTSVClone();
 228  1
         tok.reset(input);
 229  1
         return tok;
 230  
     }
 231  
 
 232  
     //-----------------------------------------------------------------------
 233  
     /**
 234  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 235  
      * as per StringTokenizer, but with no text to tokenize.
 236  
      * <p>
 237  
      * This constructor is normally used with {@link #reset(String)}.
 238  
      */
 239  
     public StrTokenizer() {
 240  6
         super();
 241  6
         this.chars = null;
 242  6
     }
 243  
 
 244  
     /**
 245  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 246  
      * as per StringTokenizer.
 247  
      *
 248  
      * @param input  the string which is to be parsed
 249  
      */
 250  
     public StrTokenizer(final String input) {
 251  49
         super();
 252  49
         if (input != null) {
 253  46
             chars = input.toCharArray();
 254  
         } else {
 255  3
             chars = null;
 256  
         }
 257  49
     }
 258  
 
 259  
     /**
 260  
      * Constructs a tokenizer splitting on the specified delimiter character.
 261  
      *
 262  
      * @param input  the string which is to be parsed
 263  
      * @param delim  the field delimiter character
 264  
      */
 265  
     public StrTokenizer(final String input, final char delim) {
 266  23
         this(input);
 267  23
         setDelimiterChar(delim);
 268  23
     }
 269  
 
 270  
     /**
 271  
      * Constructs a tokenizer splitting on the specified delimiter string.
 272  
      *
 273  
      * @param input  the string which is to be parsed
 274  
      * @param delim  the field delimiter string
 275  
      */
 276  
     public StrTokenizer(final String input, final String delim) {
 277  0
         this(input);
 278  0
         setDelimiterString(delim);
 279  0
     }
 280  
 
 281  
     /**
 282  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 283  
      *
 284  
      * @param input  the string which is to be parsed
 285  
      * @param delim  the field delimiter matcher
 286  
      */
 287  
     public StrTokenizer(final String input, final StrMatcher delim) {
 288  0
         this(input);
 289  0
         setDelimiterMatcher(delim);
 290  0
     }
 291  
 
 292  
     /**
 293  
      * Constructs a tokenizer splitting on the specified delimiter character
 294  
      * and handling quotes using the specified quote character.
 295  
      *
 296  
      * @param input  the string which is to be parsed
 297  
      * @param delim  the field delimiter character
 298  
      * @param quote  the field quoted string character
 299  
      */
 300  
     public StrTokenizer(final String input, final char delim, final char quote) {
 301  11
         this(input, delim);
 302  11
         setQuoteChar(quote);
 303  11
     }
 304  
 
 305  
     /**
 306  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 307  
      * and handling quotes using the specified quote matcher.
 308  
      *
 309  
      * @param input  the string which is to be parsed
 310  
      * @param delim  the field delimiter matcher
 311  
      * @param quote  the field quoted string matcher
 312  
      */
 313  
     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
 314  0
         this(input, delim);
 315  0
         setQuoteMatcher(quote);
 316  0
     }
 317  
 
 318  
     /**
 319  
      * Constructs a tokenizer splitting on space, tab, newline and formfeed
 320  
      * as per StringTokenizer.
 321  
      *
 322  
     * @param input  the character array which is to be parsed; the array is cloned internally
 323  
      */
 324  
     public StrTokenizer(final char[] input) {
 325  12
         super();
 326  12
         this.chars = ArrayUtils.clone(input);
 327  12
     }
 328  
 
 329  
     /**
 330  
      * Constructs a tokenizer splitting on the specified character.
 331  
      *
 332  
     * @param input  the character array which is to be parsed; the array is cloned internally
 333  
      * @param delim the field delimiter character
 334  
      */
 335  
     public StrTokenizer(final char[] input, final char delim) {
 336  6
         this(input);
 337  6
         setDelimiterChar(delim);
 338  6
     }
 339  
 
 340  
     /**
 341  
      * Constructs a tokenizer splitting on the specified string.
 342  
      *
 343  
     * @param input  the character array which is to be parsed; the array is cloned internally
 344  
      * @param delim the field delimiter string
 345  
      */
 346  
     public StrTokenizer(final char[] input, final String delim) {
 347  0
         this(input);
 348  0
         setDelimiterString(delim);
 349  0
     }
 350  
 
 351  
     /**
 352  
      * Constructs a tokenizer splitting using the specified delimiter matcher.
 353  
      *
 354  
     * @param input  the character array which is to be parsed; the array is cloned internally
 355  
      * @param delim  the field delimiter matcher
 356  
      */
 357  
     public StrTokenizer(final char[] input, final StrMatcher delim) {
 358  0
         this(input);
 359  0
         setDelimiterMatcher(delim);
 360  0
     }
 361  
 
 362  
     /**
 363  
      * Constructs a tokenizer splitting on the specified delimiter character
 364  
      * and handling quotes using the specified quote character.
 365  
      *
 366  
     * @param input  the character array which is to be parsed; the array is cloned internally
 367  
      * @param delim  the field delimiter character
 368  
      * @param quote  the field quoted string character
 369  
      */
 370  
     public StrTokenizer(final char[] input, final char delim, final char quote) {
 371  3
         this(input, delim);
 372  3
         setQuoteChar(quote);
 373  3
     }
 374  
 
 375  
     /**
 376  
      * Constructs a tokenizer splitting using the specified delimiter matcher
 377  
      * and handling quotes using the specified quote matcher.
 378  
      *
 379  
     * @param input  the character array which is to be parsed; the array is cloned internally
 380  
     * @param delim  the field delimiter matcher
 381  
     * @param quote  the field quoted string matcher
 382  
      */
 383  
     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
 384  0
         this(input, delim);
 385  0
         setQuoteMatcher(quote);
 386  0
     }
 387  
 
 388  
     // API
 389  
     //-----------------------------------------------------------------------
 390  
     /**
 391  
      * Gets the number of tokens found in the String.
 392  
      *
 393  
      * @return the number of matched tokens
 394  
      */
 395  
     public int size() {
 396  17
         checkTokenized();
 397  17
         return tokens.length;
 398  
     }
 399  
 
 400  
     /**
 401  
      * Gets the next token from the String.
 402  
      * Equivalent to {@link #next()} except it returns null rather than
 403  
      * throwing {@link NoSuchElementException} when no tokens remain.
 404  
      *
 405  
      * @return the next sequential token, or null when no more tokens are found
 406  
      */
 407  
     public String nextToken() {
 408  52
         if (hasNext()) {
 409  34
             return tokens[tokenPos++];
 410  
         }
 411  18
         return null;
 412  
     }
 413  
 
 414  
     /**
 415  
      * Gets the previous token from the String.
 416  
      *
 417  
      * @return the previous sequential token, or null when no more tokens are found
 418  
      */
 419  
     public String previousToken() {
 420  50
         if (hasPrevious()) {
 421  30
             return tokens[--tokenPos];
 422  
         }
 423  20
         return null;
 424  
     }
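A short sketch of forward and backward movement with the methods above (the input is illustrative; whitespace is the default delimiter set):

    StrTokenizer tok = new StrTokenizer("one two three");
    int count = tok.size();              // 3
    String a = tok.nextToken();          // "one"
    String b = tok.nextToken();          // "two"
    String back = tok.previousToken();   // "two" again - the cursor moved back one position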
 425  
 
 426  
     /**
 427  
      * Gets a copy of the full token list as an independent modifiable array.
 428  
      *
 429  
      * @return the tokens as a String array
 430  
      */
 431  
     public String[] getTokenArray() {
 432  12
         checkTokenized();
 433  12
         return tokens.clone();
 434  
     }
 435  
 
 436  
     /**
 437  
      * Gets a copy of the full token list as an independent modifiable list.
 438  
      *
 439  
     * @return the tokens as a String list
 440  
      */
 441  
     public List<String> getTokenList() {
 442  2
         checkTokenized();
 443  2
         final List<String> list = new ArrayList<String>(tokens.length);
 444  10
         for (final String element : tokens) {
 445  8
             list.add(element);
 446  
         }
 447  2
         return list;
 448  
     }
 449  
 
 450  
     /**
 451  
      * Resets this tokenizer, forgetting all parsing and iteration already completed.
 452  
      * <p>
 453  
      * This method allows the same tokenizer to be reused for the same String.
 454  
      *
 455  
      * @return this, to enable chaining
 456  
      */
 457  
     public StrTokenizer reset() {
 458  69
         tokenPos = 0;
 459  69
         tokens = null;
 460  69
         return this;
 461  
     }
 462  
 
 463  
     /**
 464  
      * Reset this tokenizer, giving it a new input string to parse.
 465  
      * In this manner you can re-use a tokenizer with the same settings
 466  
      * on multiple input lines.
 467  
      *
 468  
      * @param input  the new string to tokenize, null sets no text to parse
 469  
      * @return this, to enable chaining
 470  
      */
 471  
     public StrTokenizer reset(final String input) {
 472  10
         reset();
 473  10
         if (input != null) {
 474  9
             this.chars = input.toCharArray();
 475  
         } else {
 476  1
             this.chars = null;
 477  
         }
 478  10
         return this;
 479  
     }
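As the Javadoc above notes, reset(String) allows one configured tokenizer to be reused across inputs; a sketch with made-up lines:

    StrTokenizer tok = new StrTokenizer().setDelimiterChar(':');
    for (final String line : new String[] {"a:b", "c:d:e"}) {
        tok.reset(line);
        System.out.println(tok.getTokenList());   // [a, b] then [c, d, e]
    }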
 480  
 
 481  
     /**
 482  
      * Reset this tokenizer, giving it a new input string to parse.
 483  
      * In this manner you can re-use a tokenizer with the same settings
 484  
      * on multiple input lines.
 485  
      *
 486  
     * @param input  the new character array to tokenize; the array is cloned internally, null sets no text to parse
 487  
      * @return this, to enable chaining
 488  
      */
 489  
     public StrTokenizer reset(final char[] input) {
 490  10
         reset();
 491  10
         this.chars = ArrayUtils.clone(input);
 492  10
         return this;
 493  
     }
 494  
 
 495  
     // ListIterator
 496  
     //-----------------------------------------------------------------------
 497  
     /**
 498  
      * Checks whether there are any more tokens.
 499  
      *
 500  
      * @return true if there are more tokens
 501  
      */
 502  
     @Override
 503  
     public boolean hasNext() {
 504  233
         checkTokenized();
 505  233
         return tokenPos < tokens.length;
 506  
     }
 507  
 
 508  
     /**
 509  
      * Gets the next token.
 510  
      *
 511  
      * @return the next String token
 512  
      * @throws NoSuchElementException if there are no more elements
 513  
      */
 514  
     @Override
 515  
     public String next() {
 516  116
         if (hasNext()) {
 517  111
             return tokens[tokenPos++];
 518  
         }
 519  5
         throw new NoSuchElementException();
 520  
     }
 521  
 
 522  
     /**
 523  
      * Gets the index of the next token to return.
 524  
      *
 525  
      * @return the next token index
 526  
      */
 527  
     @Override
 528  
     public int nextIndex() {
 529  90
         return tokenPos;
 530  
     }
 531  
 
 532  
     /**
 533  
      * Checks whether there are any previous tokens that can be iterated to.
 534  
      *
 535  
      * @return true if there are previous tokens
 536  
      */
 537  
     @Override
 538  
     public boolean hasPrevious() {
 539  77
         checkTokenized();
 540  77
         return tokenPos > 0;
 541  
     }
 542  
 
 543  
     /**
 544  
      * Gets the token previous to the last returned token.
 545  
      *
 546  
      * @return the previous token
 547  
      */
 548  
     @Override
 549  
     public String previous() {
 550  9
         if (hasPrevious()) {
 551  8
             return tokens[--tokenPos];
 552  
         }
 553  1
         throw new NoSuchElementException();
 554  
     }
 555  
 
 556  
     /**
 557  
      * Gets the index of the previous token.
 558  
      *
 559  
      * @return the previous token index
 560  
      */
 561  
     @Override
 562  
     public int previousIndex() {
 563  20
         return tokenPos - 1;
 564  
     }
 565  
 
 566  
     /**
 567  
      * Unsupported ListIterator operation.
 568  
      *
 569  
      * @throws UnsupportedOperationException always
 570  
      */
 571  
     @Override
 572  
     public void remove() {
 573  1
         throw new UnsupportedOperationException("remove() is unsupported");
 574  
     }
 575  
 
 576  
     /**
 577  
      * Unsupported ListIterator operation.
 578  
      * @param obj this parameter ignored.
 579  
      * @throws UnsupportedOperationException always
 580  
      */
 581  
     @Override
 582  
     public void set(final String obj) {
 583  1
         throw new UnsupportedOperationException("set() is unsupported");
 584  
     }
 585  
 
 586  
     /**
 587  
      * Unsupported ListIterator operation.
 588  
      * @param obj this parameter ignored.
 589  
      * @throws UnsupportedOperationException always
 590  
      */
 591  
     @Override
 592  
     public void add(final String obj) {
 593  1
         throw new UnsupportedOperationException("add() is unsupported");
 594  
     }
 595  
 
 596  
     // Implementation
 597  
     //-----------------------------------------------------------------------
 598  
     /**
 599  
     * Checks if tokenization has been done, and if not, performs it.
 600  
      */
 601  
     private void checkTokenized() {
 602  341
         if (tokens == null) {
 603  84
             if (chars == null) {
 604  
                 // still call tokenize as subclass may do some work
 605  16
                 final List<String> split = tokenize(null, 0, 0);
 606  16
                 tokens = split.toArray(new String[split.size()]);
 607  16
             } else {
 608  68
                 final List<String> split = tokenize(chars, 0, chars.length);
 609  68
                 tokens = split.toArray(new String[split.size()]);
 610  
             }
 611  
         }
 612  341
     }
 613  
 
 614  
     /**
 615  
     * Internal method that performs the tokenization.
 616  
      * <p>
 617  
      * Most users of this class do not need to call this method. This method
 618  
      * will be called automatically by other (public) methods when required.
 619  
      * <p>
 620  
      * This method exists to allow subclasses to add code before or after the
 621  
      * tokenization. For example, a subclass could alter the character array,
 622  
      * offset or count to be parsed, or call the tokenizer multiple times on
 623  
     * multiple strings. It is also possible to filter the results.
 624  
      * <p>
 625  
      * <code>StrTokenizer</code> will always pass a zero offset and a count
 626  
      * equal to the length of the array to this method, however a subclass
 627  
      * may pass other values, or even an entirely different array.
 628  
      * 
 629  
      * @param srcChars  the character array being tokenized, may be null
 630  
      * @param offset  the start position within the character array, must be valid
 631  
      * @param count  the number of characters to tokenize, must be valid
 632  
      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
 633  
      */
 634  
     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
 635  84
         if (srcChars == null || count == 0) {
 636  22
             return Collections.emptyList();
 637  
         }
 638  62
         final StrBuilder buf = new StrBuilder();
 639  62
         final List<String> tokenList = new ArrayList<String>();
 640  62
         int pos = offset;
 641  
         
 642  
         // loop around the entire buffer
 643  264
         while (pos >= 0 && pos < count) {
 644  
             // find next token
 645  202
             pos = readNextToken(srcChars, pos, count, buf, tokenList);
 646  
             
 647  
             // handle case where end of string is a delimiter
 648  202
             if (pos >= count) {
 649  12
                 addToken(tokenList, "");
 650  
             }
 651  
         }
 652  62
         return tokenList;
 653  
     }
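The Javadoc above describes overriding tokenize() in a subclass to filter the results. A hypothetical sketch (the class name and the '#'-comment rule are invented for illustration; java.util imports assumed):

    public class CommentSkippingTokenizer extends StrTokenizer {
        public CommentSkippingTokenizer(final String input) {
            super(input);
        }

        @Override
        protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
            // copy first, because the superclass may return an unmodifiable empty list
            final List<String> filtered = new ArrayList<String>(super.tokenize(srcChars, offset, count));
            for (final Iterator<String> it = filtered.iterator(); it.hasNext();) {
                final String token = it.next();
                if (token != null && token.startsWith("#")) {
                    it.remove();   // drop tokens that look like comments
                }
            }
            return filtered;
        }
    }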
 654  
 
 655  
     /**
 656  
      * Adds a token to a list, paying attention to the parameters we've set.
 657  
      *
 658  
      * @param list  the list to add to
 659  
      * @param tok  the token to add
 660  
      */
 661  
     private void addToken(final List<String> list, String tok) {
 662  214
         if (StringUtils.isEmpty(tok)) {
 663  35
             if (isIgnoreEmptyTokens()) {
 664  12
                 return;
 665  
             }
 666  23
             if (isEmptyTokenAsNull()) {
 667  11
                 tok = null;
 668  
             }
 669  
         }
 670  202
         list.add(tok);
 671  202
     }
 672  
 
 673  
     /**
 674  
      * Reads character by character through the String to get the next token.
 675  
      *
 676  
      * @param srcChars  the character array being tokenized
 677  
      * @param start  the first character of field
 678  
      * @param len  the length of the character array being tokenized
 679  
      * @param workArea  a temporary work area
 680  
      * @param tokenList  the list of parsed tokens
 681  
      * @return the starting position of the next field (the character
 682  
      *  immediately after the delimiter), or -1 if end of string found
 683  
      */
 684  
     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
 685  
         // skip all leading whitespace, unless it is the
 686  
         // field delimiter or the quote character
 687  271
         while (start < len) {
 688  266
             final int removeLen = Math.max(
 689  
                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
 690  
                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
 691  266
             if (removeLen == 0 ||
 692  
                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
 693  
                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
 694  0
                 break;
 695  
             }
 696  69
             start += removeLen;
 697  69
         }
 698  
         
 699  
         // handle reaching end
 700  202
         if (start >= len) {
 701  5
             addToken(tokenList, "");
 702  5
             return -1;
 703  
         }
 704  
         
 705  
         // handle empty token
 706  197
         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
 707  197
         if (delimLen > 0) {
 708  18
             addToken(tokenList, "");
 709  18
             return start + delimLen;
 710  
         }
 711  
         
 712  
         // handle found token
 713  179
         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
 714  179
         if (quoteLen > 0) {
 715  17
             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
 716  
         }
 717  162
         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
 718  
     }
 719  
 
 720  
     /**
 721  
      * Reads a possibly quoted string token.
 722  
      *
 723  
      * @param srcChars  the character array being tokenized
 724  
      * @param start  the first character of field
 725  
      * @param len  the length of the character array being tokenized
 726  
      * @param workArea  a temporary work area
 727  
      * @param tokenList  the list of parsed tokens
 728  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 729  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 730  
      * @return the starting position of the next field (the character
 731  
     *  immediately after the delimiter), or -1 if the end of the
 732  
     *  string was reached
 733  
      */
 734  
     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 
 735  
                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
 736  
         // Loop until we've found the end of the quoted
 737  
         // string or the end of the input
 738  179
         workArea.clear();
 739  179
         int pos = start;
 740  179
         boolean quoting = quoteLen > 0;
 741  179
         int trimStart = 0;
 742  
         
 743  490
         while (pos < len) {
 744  
             // quoting mode can occur several times throughout a string
 745  
             // we must switch between quoting and non-quoting until we
 746  
             // encounter a non-quoted delimiter, or end of string
 747  445
             if (quoting) {
 748  
                 // In quoting mode
 749  
                 
 750  
                 // If we've found a quote character, see if it's
 751  
                 // followed by a second quote.  If so, then we need
 752  
                 // to actually put the quote character into the token
 753  
                 // rather than end the token.
 754  93
                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 755  27
                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
 756  
                         // matched pair of quotes, thus an escaped quote
 757  7
                         workArea.append(srcChars, pos, quoteLen);
 758  7
                         pos += quoteLen * 2;
 759  7
                         trimStart = workArea.size();
 760  7
                         continue;
 761  
                     }
 762  
                     
 763  
                     // end of quoting
 764  20
                     quoting = false;
 765  20
                     pos += quoteLen;
 766  20
                     continue;
 767  
                 }
 768  
                 
 769  
                 // copy regular character from inside quotes
 770  66
                 workArea.append(srcChars[pos++]);
 771  66
                 trimStart = workArea.size();
 772  
                 
 773  
             } else {
 774  
                 // Not in quoting mode
 775  
                 
 776  
                 // check for delimiter, and thus end of token
 777  352
                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
 778  352
                 if (delimLen > 0) {
 779  
                     // return condition when end of token found
 780  134
                     addToken(tokenList, workArea.substring(0, trimStart));
 781  134
                     return pos + delimLen;
 782  
                 }
 783  
                 
 784  
                 // check for quote, and thus back into quoting mode
 785  218
                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
 786  4
                     quoting = true;
 787  4
                     pos += quoteLen;
 788  4
                     continue;
 789  
                 }
 790  
                 
 791  
                 // check for ignored (outside quotes), and ignore
 792  214
                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
 793  214
                 if (ignoredLen > 0) {
 794  12
                     pos += ignoredLen;
 795  12
                     continue;
 796  
                 }
 797  
                 
 798  
                 // check for trimmed character
 799  
                // don't yet know if it's at the end, so copy to workArea
 800  
                 // use trimStart to keep track of trim at the end
 801  202
                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
 802  202
                 if (trimmedLen > 0) {
 803  17
                     workArea.append(srcChars, pos, trimmedLen);
 804  17
                     pos += trimmedLen;
 805  17
                     continue;
 806  
                 }
 807  
                 
 808  
                 // copy regular character from outside quotes
 809  185
                 workArea.append(srcChars[pos++]);
 810  185
                 trimStart = workArea.size();
 811  185
             }
 812  
         }
 813  
         
 814  
         // return condition when end of string found
 815  45
         addToken(tokenList, workArea.substring(0, trimStart));
 816  45
         return -1;
 817  
     }
 818  
 
 819  
     /**
 820  
      * Checks if the characters at the index specified match the quote
 821  
      * already matched in readNextToken().
 822  
      *
 823  
      * @param srcChars  the character array being tokenized
 824  
      * @param pos  the position to check for a quote
 825  
      * @param len  the length of the character array being tokenized
 826  
      * @param quoteStart  the start position of the matched quote, 0 if no quoting
 827  
      * @param quoteLen  the length of the matched quote, 0 if no quoting
 828  
      * @return true if a quote is matched
 829  
      */
 830  
     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
 831  173
         for (int i = 0; i < quoteLen; i++) {
 832  135
             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
 833  97
                 return false;
 834  
             }
 835  
         }
 836  38
         return true;
 837  
     }
 838  
 
 839  
     // Delimiter
 840  
     //-----------------------------------------------------------------------
 841  
     /**
 842  
      * Gets the field delimiter matcher.
 843  
      *
 844  
      * @return the delimiter matcher in use
 845  
      */
 846  
     public StrMatcher getDelimiterMatcher() {
 847  622
         return this.delimMatcher;
 848  
     }
 849  
 
 850  
     /**
 851  
      * Sets the field delimiter matcher.
 852  
      * <p>
 853  
     * The delimiter is used to separate one token from another.
 854  
      *
 855  
      * @param delim  the delimiter matcher to use
 856  
      * @return this, to enable chaining
 857  
      */
 858  
     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
 859  42
         if (delim == null) {
 860  1
             this.delimMatcher = StrMatcher.noneMatcher();
 861  
         } else {
 862  41
             this.delimMatcher = delim;
 863  
         }
 864  42
         return this;
 865  
     }
 866  
 
 867  
     /**
 868  
      * Sets the field delimiter character.
 869  
      *
 870  
      * @param delim  the delimiter character to use
 871  
      * @return this, to enable chaining
 872  
      */
 873  
     public StrTokenizer setDelimiterChar(final char delim) {
 874  36
         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
 875  
     }
 876  
 
 877  
     /**
 878  
      * Sets the field delimiter string.
 879  
      *
 880  
      * @param delim  the delimiter string to use
 881  
      * @return this, to enable chaining
 882  
      */
 883  
     public StrTokenizer setDelimiterString(final String delim) {
 884  1
         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
 885  
     }
 886  
 
 887  
     // Quote
 888  
     //-----------------------------------------------------------------------
 889  
     /**
 890  
      * Gets the quote matcher currently in use.
 891  
      * <p>
 892  
      * The quote character is used to wrap data between the tokens.
 893  
      * This enables delimiters to be entered as data.
 894  
     * The default value is not to quote anything (the none matcher).
 895  
      *
 896  
      * @return the quote matcher in use
 897  
      */
 898  
     public StrMatcher getQuoteMatcher() {
 899  250
         return quoteMatcher;
 900  
     }
 901  
 
 902  
     /**
 903  
      * Set the quote matcher to use.
 904  
      * <p>
 905  
      * The quote character is used to wrap data between the tokens.
 906  
      * This enables delimiters to be entered as data.
 907  
      *
 908  
      * @param quote  the quote matcher to use, null ignored
 909  
      * @return this, to enable chaining
 910  
      */
 911  
     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
 912  28
         if (quote != null) {
 913  27
             this.quoteMatcher = quote;
 914  
         }
 915  28
         return this;
 916  
     }
 917  
 
 918  
     /**
 919  
      * Sets the quote character to use.
 920  
      * <p>
 921  
      * The quote character is used to wrap data between the tokens.
 922  
      * This enables delimiters to be entered as data.
 923  
      *
 924  
      * @param quote  the quote character to use
 925  
      * @return this, to enable chaining
 926  
      */
 927  
     public StrTokenizer setQuoteChar(final char quote) {
 928  21
         return setQuoteMatcher(StrMatcher.charMatcher(quote));
 929  
     }
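The quote matcher documented above lets delimiters appear inside data, and (per the class Javadoc) a quote inside a quoted section is escaped by doubling it; a small sketch with an illustrative input:

    StrTokenizer tok = new StrTokenizer("\"say \"\"hi\"\"\",b", ',', '"');
    String first = tok.next();    // say "hi"   (the doubled quote is collapsed)
    String second = tok.next();   // b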
 930  
 
 931  
     // Ignored
 932  
     //-----------------------------------------------------------------------
 933  
     /**
 934  
      * Gets the ignored character matcher.
 935  
      * <p>
 936  
      * These characters are ignored when parsing the String, unless they are
 937  
      * within a quoted region.
 938  
      * The default value is not to ignore anything.
 939  
      *
 940  
      * @return the ignored matcher in use
 941  
      */
 942  
     public StrMatcher getIgnoredMatcher() {
 943  480
         return ignoredMatcher;
 944  
     }
 945  
 
 946  
     /**
 947  
      * Set the matcher for characters to ignore.
 948  
      * <p>
 949  
      * These characters are ignored when parsing the String, unless they are
 950  
      * within a quoted region.
 951  
      *
 952  
      * @param ignored  the ignored matcher to use, null ignored
 953  
      * @return this, to enable chaining
 954  
      */
 955  
     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
 956  16
         if (ignored != null) {
 957  15
             this.ignoredMatcher = ignored;
 958  
         }
 959  16
         return this;
 960  
     }
 961  
 
 962  
     /**
 963  
      * Set the character to ignore.
 964  
      * <p>
 965  
      * This character is ignored when parsing the String, unless it is
 966  
      * within a quoted region.
 967  
      *
 968  
      * @param ignored  the ignored character to use
 969  
      * @return this, to enable chaining
 970  
      */
 971  
     public StrTokenizer setIgnoredChar(final char ignored) {
 972  1
         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
 973  
     }
 974  
 
 975  
     // Trimmer
 976  
     //-----------------------------------------------------------------------
 977  
     /**
 978  
      * Gets the trimmer character matcher.
 979  
      * <p>
 980  
      * These characters are trimmed off on each side of the delimiter
 981  
      * until the token or quote is found.
 982  
      * The default value is not to trim anything.
 983  
      *
 984  
      * @return the trimmer matcher in use
 985  
      */
 986  
     public StrMatcher getTrimmerMatcher() {
 987  468
         return trimmerMatcher;
 988  
     }
 989  
 
 990  
     /**
 991  
      * Sets the matcher for characters to trim.
 992  
      * <p>
 993  
      * These characters are trimmed off on each side of the delimiter
 994  
      * until the token or quote is found.
 995  
      *
 996  
      * @param trimmer  the trimmer matcher to use, null ignored
 997  
      * @return this, to enable chaining
 998  
      */
 999  
     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
 1000  11
         if (trimmer != null) {
 1001  10
             this.trimmerMatcher = trimmer;
 1002  
         }
 1003  11
         return this;
 1004  
     }
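A sketch of chained configuration using the matcher setters above; the delimiter, quote, ignored and trimmer choices are illustrative:

    StrTokenizer tok = new StrTokenizer()
            .setDelimiterChar(';')
            .setQuoteChar('\'')
            .setIgnoredChar('\r')
            .setTrimmerMatcher(StrMatcher.trimMatcher());
    tok.reset(" 'a;b' ; c\r ");
    System.out.println(tok.getTokenList());   // [a;b, c]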
 1005  
 
 1006  
     //-----------------------------------------------------------------------
 1007  
     /**
 1008  
      * Gets whether the tokenizer currently returns empty tokens as null.
 1009  
      * The default for this property is false.
 1010  
      *
 1011  
      * @return true if empty tokens are returned as null
 1012  
      */
 1013  
     public boolean isEmptyTokenAsNull() {
 1014  23
         return this.emptyAsNull;
 1015  
     }
 1016  
 
 1017  
     /**
 1018  
      * Sets whether the tokenizer should return empty tokens as null.
 1019  
      * The default for this property is false.
 1020  
      *
 1021  
      * @param emptyAsNull  whether empty tokens are returned as null
 1022  
      * @return this, to enable chaining
 1023  
      */
 1024  
     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
 1025  16
         this.emptyAsNull = emptyAsNull;
 1026  16
         return this;
 1027  
     }
 1028  
 
 1029  
     //-----------------------------------------------------------------------
 1030  
     /**
 1031  
      * Gets whether the tokenizer currently ignores empty tokens.
 1032  
      * The default for this property is true.
 1033  
      *
 1034  
      * @return true if empty tokens are not returned
 1035  
      */
 1036  
     public boolean isIgnoreEmptyTokens() {
 1037  35
         return ignoreEmptyTokens;
 1038  
     }
 1039  
 
 1040  
     /**
 1041  
      * Sets whether the tokenizer should ignore and not return empty tokens.
 1042  
      * The default for this property is true.
 1043  
      *
 1044  
      * @param ignoreEmptyTokens  whether empty tokens are not returned
 1045  
      * @return this, to enable chaining
 1046  
      */
 1047  
     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
 1048  24
         this.ignoreEmptyTokens = ignoreEmptyTokens;
 1049  24
         return this;
 1050  
     }
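A sketch of how the two empty-token properties above interact, using the illustrative input "a,,b" with a comma delimiter (java.util.List import assumed):

    List<String> skipped = new StrTokenizer("a,,b", ',').getTokenList();   // [a, b]   (empty token ignored - the default)
    List<String> kept = new StrTokenizer("a,,b", ',')
            .setIgnoreEmptyTokens(false).getTokenList();                   // [a, , b]   (empty string kept)
    List<String> asNull = new StrTokenizer("a,,b", ',')
            .setIgnoreEmptyTokens(false)
            .setEmptyTokenAsNull(true).getTokenList();                     // [a, null, b]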
 1051  
 
 1052  
     //-----------------------------------------------------------------------
 1053  
     /**
 1054  
      * Gets the String content that the tokenizer is parsing.
 1055  
      *
 1056  
      * @return the string content being parsed
 1057  
      */
 1058  
     public String getContent() {
 1059  4
         if (chars == null) {
 1060  2
             return null;
 1061  
         }
 1062  2
         return new String(chars);
 1063  
     }
 1064  
 
 1065  
     //-----------------------------------------------------------------------
 1066  
     /**
 1067  
      * Creates a new instance of this Tokenizer. The new instance is reset so
 1068  
      * that it will be at the start of the token list.
 1069  
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
 1070  
      * 
 1071  
      * @return a new instance of this Tokenizer which has been reset.
 1072  
      */
 1073  
     @Override
 1074  
     public Object clone() {
 1075  
         try {
 1076  45
             return cloneReset();
 1077  1
         } catch (final CloneNotSupportedException ex) {
 1078  1
             return null;
 1079  
         }
 1080  
     }
 1081  
 
 1082  
     /**
 1083  
      * Creates a new instance of this Tokenizer. The new instance is reset so that
 1084  
      * it will be at the start of the token list.
 1085  
      * 
 1086  
      * @return a new instance of this Tokenizer which has been reset.
 1087  
      * @throws CloneNotSupportedException if there is a problem cloning
 1088  
      */
 1089  
     Object cloneReset() throws CloneNotSupportedException {
 1090  
         // this method exists to enable 100% test coverage
 1091  44
         final StrTokenizer cloned = (StrTokenizer) super.clone();
 1092  44
         if (cloned.chars != null) {
 1093  1
             cloned.chars = cloned.chars.clone();
 1094  
         }
 1095  44
         cloned.reset();
 1096  44
         return cloned;
 1097  
     }
 1098  
 
 1099  
     //-----------------------------------------------------------------------
 1100  
     /**
 1101  
     * Gets a String representation of this tokenizer, listing the parsed tokens.
 1102  
      *
 1103  
     * @return the string representation, or a note that tokenization has not yet occurred
 1104  
      */
 1105  
     @Override
 1106  
     public String toString() {
 1107  2
         if (tokens == null) {
 1108  1
             return "StrTokenizer[not tokenized yet]";
 1109  
         }
 1110  1
         return "StrTokenizer" + getTokenList();
 1111  
     }
 1112  
 
 1113  
 }