/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
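 * <p>
 * A minimal usage sketch (illustrative only; the input value and variable names are examples):
 * <pre>
 * StrTokenizer tok = StrTokenizer.getCSVInstance("a, \"b,c\", d");
 * while (tok.hasNext()) {
 *     String token = tok.next();   // "a", then "b,c", then "d"
 * }
 * </pre>
 * <p>
 * The examples below show how different inputs are split (comma delimiter,
 * CSV-style trimming and quoting):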
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StrTokenizer.html">
 * StrTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens. */
    private String tokens[];
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer. The input array is cloned, so later changes to
     * it do not affect the tokenizer.
     *
     * @param input  the character array which is to be parsed
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
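     * <p>
     * An illustrative sketch (the input text and variable name are examples):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("one two\tthree".toCharArray(), StrMatcher.splitMatcher());
     * // tokens: "one", "two", "three"
     * </pre>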
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
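     * <p>
     * An illustrative sketch of such reuse (the {@code lines} collection is a hypothetical variable):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     List&lt;String&gt; fields = tok.reset(line).getTokenList();
     * }
     * </pre>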
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines. The array is cloned, so later changes to it
     * do not affect the tokenizer.
     *
     * @param input  the new character array to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
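                // (For example, with a double-quote quote character the
                // quoted input "a""b" is read as the single token a"b.)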
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the specified index match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
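     * <p>
     * For example (an illustrative sketch), {@code new StrTokenizer("a;b;c").setDelimiterChar(';')}
     * tokenizes on semicolons instead of the default whitespace characters.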
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is the none matcher, meaning no quoting is recognized.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
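     * <p>
     * For example (an illustrative sketch), calling {@code setTrimmerMatcher(StrMatcher.trimMatcher())}
     * trims whitespace around each token, as the CSV and TSV instances do by default.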
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code>
     * is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the list
     * of parsed tokens once tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}