001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.List; 023import java.util.ListIterator; 024import java.util.NoSuchElementException; 025 026import org.apache.commons.lang3.ArrayUtils; 027import org.apache.commons.lang3.StringUtils; 028import org.apache.commons.text.matcher.StringMatcher; 029import org.apache.commons.text.matcher.StringMatcherFactory; 030 031/** 032 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts. 033 * <p> 034 * This class can split a String into many smaller strings. It aims to do a similar job to 035 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including 036 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}. 037 * <p> 038 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a 039 * <em>delimiter</em>. One or more delimiter characters must be specified. 040 * <p> 041 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be 042 * escaped within a quoted section by duplicating itself. 043 * <p> 044 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher 045 * specifies these characters. One usage might be to trim whitespace characters. 046 * <p> 047 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies 048 * these characters to be removed. One usage might be to remove new line characters. 049 * <p> 050 * Empty tokens may be removed or returned as null. 051 * 052 * <pre> 053 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 054 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 055 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 056 * </pre> 057 * 058 * <table> 059 * <caption>StringTokenizer properties and options</caption> 060 * <tr> 061 * <th>Property</th> 062 * <th>Type</th> 063 * <th>Default</th> 064 * </tr> 065 * <tr> 066 * <td>delim</td> 067 * <td>CharSetMatcher</td> 068 * <td>{ \t\n\r\f}</td> 069 * </tr> 070 * <tr> 071 * <td>quote</td> 072 * <td>NoneMatcher</td> 073 * <td>{}</td> 074 * </tr> 075 * <tr> 076 * <td>ignore</td> 077 * <td>NoneMatcher</td> 078 * <td>{}</td> 079 * </tr> 080 * <tr> 081 * <td>emptyTokenAsNull</td> 082 * <td>boolean</td> 083 * <td>false</td> 084 * </tr> 085 * <tr> 086 * <td>ignoreEmptyTokens</td> 087 * <td>boolean</td> 088 * <td>true</td> 089 * </tr> 090 * </table> 091 * 092 * @since 1.3 093 */ 094public class StringTokenizer implements ListIterator<String>, Cloneable { 095 096 /** Comma separated values tokenizer internal variable. */ 097 // @formatter:off 098 private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer() 099 .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher()) 100 .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher()) 101 .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher()) 102 .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher()) 103 .setEmptyTokenAsNull(false) 104 .setIgnoreEmptyTokens(false); 105 // @formatter:on 106 107 /** Tab separated values tokenizer internal variable. */ 108 // @formatter:off 109 private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer() 110 .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher()) 111 .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher()) 112 .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher()) 113 .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher()) 114 .setEmptyTokenAsNull(false) 115 .setIgnoreEmptyTokens(false); 116 // @formatter:on 117 118 /** 119 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 120 * 121 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 122 */ 123 private static StringTokenizer getCSVClone() { 124 return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 125 } 126 127 /** 128 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. 129 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the 130 * setTrimmer method). 131 * <p> 132 * You must call a "reset" method to set the string which you want to parse. 133 * </p> 134 * 135 * @return a new tokenizer instance which parses Comma Separated Value strings 136 */ 137 public static StringTokenizer getCSVInstance() { 138 return getCSVClone(); 139 } 140 141 /** 142 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. 143 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the 144 * setTrimmer method). 145 * 146 * @param input 147 * the text to parse 148 * @return a new tokenizer instance which parses Comma Separated Value strings 149 */ 150 public static StringTokenizer getCSVInstance(final char[] input) { 151 return getCSVClone().reset(input); 152 } 153 154 /** 155 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. 156 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the 157 * setTrimmer method). 158 * 159 * @param input 160 * the text to parse 161 * @return a new tokenizer instance which parses Comma Separated Value strings 162 */ 163 public static StringTokenizer getCSVInstance(final String input) { 164 return getCSVClone().reset(input); 165 } 166 167 /** 168 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 169 * 170 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 171 */ 172 private static StringTokenizer getTSVClone() { 173 return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 174 } 175 176 /** 177 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be 178 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 179 * <p> 180 * You must call a "reset" method to set the string which you want to parse. 181 * </p> 182 * 183 * @return a new tokenizer instance which parses Tab Separated Value strings. 184 */ 185 public static StringTokenizer getTSVInstance() { 186 return getTSVClone(); 187 } 188 189 /** 190 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be 191 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 192 * 193 * @param input 194 * the string to parse 195 * @return a new tokenizer instance which parses Tab Separated Value strings. 196 */ 197 public static StringTokenizer getTSVInstance(final char[] input) { 198 return getTSVClone().reset(input); 199 } 200 201 /** 202 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be 203 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 204 * 205 * @param input 206 * the string to parse 207 * @return a new tokenizer instance which parses Tab Separated Value strings. 208 */ 209 public static StringTokenizer getTSVInstance(final String input) { 210 return getTSVClone().reset(input); 211 } 212 213 /** The text to work on. */ 214 private char[] chars; 215 216 /** The parsed tokens. */ 217 private String[] tokens; 218 219 /** The current iteration position. */ 220 private int tokenPos; 221 222 /** The delimiter matcher. */ 223 private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher(); 224 225 /** The quote matcher. */ 226 private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher(); 227 228 /** The ignored matcher. */ 229 private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher(); 230 231 /** The trimmer matcher. */ 232 private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher(); 233 234 /** Whether to return empty tokens as null. */ 235 private boolean emptyAsNull; 236 237 /** Whether to ignore empty tokens. */ 238 private boolean ignoreEmptyTokens = true; 239 240 /** 241 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to 242 * tokenize. 243 * <p> 244 * This constructor is normally used with {@link #reset(String)}. 245 * </p> 246 */ 247 public StringTokenizer() { 248 this.chars = null; 249 } 250 251 /** 252 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer. 253 * 254 * @param input 255 * the string which is to be parsed, not cloned 256 */ 257 public StringTokenizer(final char[] input) { 258 this.chars = input != null ? input.clone() : null; 259 } 260 261 /** 262 * Constructs a tokenizer splitting on the specified character. 263 * 264 * @param input 265 * the string which is to be parsed, not cloned 266 * @param delim 267 * the field delimiter character 268 */ 269 public StringTokenizer(final char[] input, final char delim) { 270 this(input); 271 setDelimiterChar(delim); 272 } 273 274 /** 275 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified 276 * quote character. 277 * 278 * @param input 279 * the string which is to be parsed, not cloned 280 * @param delim 281 * the field delimiter character 282 * @param quote 283 * the field quoted string character 284 */ 285 public StringTokenizer(final char[] input, final char delim, final char quote) { 286 this(input, delim); 287 setQuoteChar(quote); 288 } 289 290 /** 291 * Constructs a tokenizer splitting on the specified string. 292 * 293 * @param input 294 * the string which is to be parsed, not cloned 295 * @param delim 296 * the field delimiter string 297 */ 298 public StringTokenizer(final char[] input, final String delim) { 299 this(input); 300 setDelimiterString(delim); 301 } 302 303 /** 304 * Constructs a tokenizer splitting using the specified delimiter matcher. 305 * 306 * @param input 307 * the string which is to be parsed, not cloned 308 * @param delim 309 * the field delimiter matcher 310 */ 311 public StringTokenizer(final char[] input, final StringMatcher delim) { 312 this(input); 313 setDelimiterMatcher(delim); 314 } 315 316 /** 317 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified 318 * quote matcher. 319 * 320 * @param input 321 * the string which is to be parsed, not cloned 322 * @param delim 323 * the field delimiter character 324 * @param quote 325 * the field quoted string character 326 */ 327 public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) { 328 this(input, delim); 329 setQuoteMatcher(quote); 330 } 331 332 /** 333 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer. 334 * 335 * @param input 336 * the string which is to be parsed 337 */ 338 public StringTokenizer(final String input) { 339 this.chars = input != null ? input.toCharArray() : null; 340 } 341 342 /** 343 * Constructs a tokenizer splitting on the specified delimiter character. 344 * 345 * @param input 346 * the string which is to be parsed 347 * @param delim 348 * the field delimiter character 349 */ 350 public StringTokenizer(final String input, final char delim) { 351 this(input); 352 setDelimiterChar(delim); 353 } 354 355 /** 356 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified 357 * quote character. 358 * 359 * @param input 360 * the string which is to be parsed 361 * @param delim 362 * the field delimiter character 363 * @param quote 364 * the field quoted string character 365 */ 366 public StringTokenizer(final String input, final char delim, final char quote) { 367 this(input, delim); 368 setQuoteChar(quote); 369 } 370 371 /** 372 * Constructs a tokenizer splitting on the specified delimiter string. 373 * 374 * @param input 375 * the string which is to be parsed 376 * @param delim 377 * the field delimiter string 378 */ 379 public StringTokenizer(final String input, final String delim) { 380 this(input); 381 setDelimiterString(delim); 382 } 383 384 /** 385 * Constructs a tokenizer splitting using the specified delimiter matcher. 386 * 387 * @param input 388 * the string which is to be parsed 389 * @param delim 390 * the field delimiter matcher 391 */ 392 public StringTokenizer(final String input, final StringMatcher delim) { 393 this(input); 394 setDelimiterMatcher(delim); 395 } 396 397 /** 398 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified 399 * quote matcher. 400 * 401 * @param input 402 * the string which is to be parsed 403 * @param delim 404 * the field delimiter matcher 405 * @param quote 406 * the field quoted string matcher 407 */ 408 public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) { 409 this(input, delim); 410 setQuoteMatcher(quote); 411 } 412 413 /** 414 * Unsupported ListIterator operation. 415 * 416 * @param obj 417 * this parameter ignored. 418 * @throws UnsupportedOperationException 419 * always 420 */ 421 @Override 422 public void add(final String obj) { 423 throw new UnsupportedOperationException("add() is unsupported"); 424 } 425 426 /** 427 * Adds a token to a list, paying attention to the parameters we've set. 428 * 429 * @param list 430 * the list to add to 431 * @param tok 432 * the token to add 433 */ 434 private void addToken(final List<String> list, String tok) { 435 if (tok == null || tok.isEmpty()) { 436 if (isIgnoreEmptyTokens()) { 437 return; 438 } 439 if (isEmptyTokenAsNull()) { 440 tok = null; 441 } 442 } 443 list.add(tok); 444 } 445 446 /** 447 * Checks if tokenization has been done, and if not then do it. 448 */ 449 private void checkTokenized() { 450 if (tokens == null) { 451 final List<String> split; 452 if (chars == null) { 453 // still call tokenize as subclass may do some work 454 split = tokenize(null, 0, 0); 455 } else { 456 split = tokenize(chars, 0, chars.length); 457 } 458 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 459 } 460 } 461 462 /** 463 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token 464 * list. If a {@link CloneNotSupportedException} is caught, return {@code null}. 465 * 466 * @return a new instance of this Tokenizer which has been reset. 467 */ 468 @Override 469 public Object clone() { 470 try { 471 return cloneReset(); 472 } catch (final CloneNotSupportedException ex) { 473 return null; 474 } 475 } 476 477 /** 478 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token 479 * list. 480 * 481 * @return a new instance of this Tokenizer which has been reset. 482 * @throws CloneNotSupportedException 483 * if there is a problem cloning 484 */ 485 Object cloneReset() throws CloneNotSupportedException { 486 // this method exists to enable 100% test coverage 487 final StringTokenizer cloned = (StringTokenizer) super.clone(); 488 if (cloned.chars != null) { 489 cloned.chars = cloned.chars.clone(); 490 } 491 cloned.reset(); 492 return cloned; 493 } 494 495 /** 496 * Gets the String content that the tokenizer is parsing. 497 * 498 * @return The string content being parsed 499 */ 500 public String getContent() { 501 if (chars == null) { 502 return null; 503 } 504 return new String(chars); 505 } 506 507 /** 508 * Gets the field delimiter matcher. 509 * 510 * @return The delimiter matcher in use 511 */ 512 public StringMatcher getDelimiterMatcher() { 513 return this.delimMatcher; 514 } 515 516 /** 517 * Gets the ignored character matcher. 518 * <p> 519 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value 520 * is not to ignore anything. 521 * </p> 522 * 523 * @return The ignored matcher in use 524 */ 525 public StringMatcher getIgnoredMatcher() { 526 return ignoredMatcher; 527 } 528 529 /** 530 * Gets the quote matcher currently in use. 531 * <p> 532 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The 533 * default value is '"' (double quote). 534 * </p> 535 * 536 * @return The quote matcher in use 537 */ 538 public StringMatcher getQuoteMatcher() { 539 return quoteMatcher; 540 } 541 542 /** 543 * Gets a copy of the full token list as an independent modifiable array. 544 * 545 * @return The tokens as a String array 546 */ 547 public String[] getTokenArray() { 548 checkTokenized(); 549 return tokens.clone(); 550 } 551 552 /** 553 * Gets a copy of the full token list as an independent modifiable list. 554 * 555 * @return The tokens as a String list 556 */ 557 public List<String> getTokenList() { 558 checkTokenized(); 559 return new ArrayList<>(Arrays.asList(tokens)); 560 } 561 562 /** 563 * Gets the trimmer character matcher. 564 * <p> 565 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default 566 * value is not to trim anything. 567 * </p> 568 * 569 * @return The trimmer matcher in use 570 */ 571 public StringMatcher getTrimmerMatcher() { 572 return trimmerMatcher; 573 } 574 575 /** 576 * Tests whether there are any more tokens. 577 * 578 * @return true if there are more tokens 579 */ 580 @Override 581 public boolean hasNext() { 582 checkTokenized(); 583 return tokenPos < tokens.length; 584 } 585 586 /** 587 * Tests whether there are any previous tokens that can be iterated to. 588 * 589 * @return true if there are previous tokens 590 */ 591 @Override 592 public boolean hasPrevious() { 593 checkTokenized(); 594 return tokenPos > 0; 595 } 596 597 /** 598 * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false. 599 * 600 * @return true if empty tokens are returned as null 601 */ 602 public boolean isEmptyTokenAsNull() { 603 return this.emptyAsNull; 604 } 605 606 /** 607 * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true. 608 * 609 * @return true if empty tokens are not returned 610 */ 611 public boolean isIgnoreEmptyTokens() { 612 return ignoreEmptyTokens; 613 } 614 615 /** 616 * Tests if the characters at the index specified match the quote already matched in readNextToken(). 617 * 618 * @param srcChars 619 * the character array being tokenized 620 * @param pos 621 * the position to check for a quote 622 * @param len 623 * the length of the character array being tokenized 624 * @param quoteStart 625 * the start position of the matched quote, 0 if no quoting 626 * @param quoteLen 627 * the length of the matched quote, 0 if no quoting 628 * @return true if a quote is matched 629 */ 630 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, 631 final int quoteLen) { 632 for (int i = 0; i < quoteLen; i++) { 633 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 634 return false; 635 } 636 } 637 return true; 638 } 639 640 /** 641 * Gets the next token. 642 * 643 * @return The next String token 644 * @throws NoSuchElementException 645 * if there are no more elements 646 */ 647 @Override 648 public String next() { 649 if (hasNext()) { 650 return tokens[tokenPos++]; 651 } 652 throw new NoSuchElementException(); 653 } 654 655 /** 656 * Gets the index of the next token to return. 657 * 658 * @return The next token index 659 */ 660 @Override 661 public int nextIndex() { 662 return tokenPos; 663 } 664 665 /** 666 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing 667 * {@link NoSuchElementException} when no tokens remain. 668 * 669 * @return The next sequential token, or null when no more tokens are found 670 */ 671 public String nextToken() { 672 if (hasNext()) { 673 return tokens[tokenPos++]; 674 } 675 return null; 676 } 677 678 /** 679 * Gets the token previous to the last returned token. 680 * 681 * @return The previous token 682 */ 683 @Override 684 public String previous() { 685 if (hasPrevious()) { 686 return tokens[--tokenPos]; 687 } 688 throw new NoSuchElementException(); 689 } 690 691 /** 692 * Gets the index of the previous token. 693 * 694 * @return The previous token index 695 */ 696 @Override 697 public int previousIndex() { 698 return tokenPos - 1; 699 } 700 701 /** 702 * Gets the previous token from the String. 703 * 704 * @return The previous sequential token, or null when no more tokens are found 705 */ 706 public String previousToken() { 707 if (hasPrevious()) { 708 return tokens[--tokenPos]; 709 } 710 return null; 711 } 712 713 /** 714 * Reads character by character through the String to get the next token. 715 * 716 * @param srcChars 717 * the character array being tokenized 718 * @param start 719 * the first character of field 720 * @param len 721 * the length of the character array being tokenized 722 * @param workArea 723 * a temporary work area 724 * @param tokenList 725 * the list of parsed tokens 726 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of 727 * string found 728 */ 729 private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea, 730 final List<String> tokenList) { 731 // skip all leading whitespace, unless it is the 732 // field delimiter or the quote character 733 while (start < len) { 734 final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len), 735 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 736 if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 737 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 738 break; 739 } 740 start += removeLen; 741 } 742 743 // handle reaching end 744 if (start >= len) { 745 addToken(tokenList, StringUtils.EMPTY); 746 return -1; 747 } 748 749 // handle empty token 750 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 751 if (delimLen > 0) { 752 addToken(tokenList, StringUtils.EMPTY); 753 return start + delimLen; 754 } 755 756 // handle found token 757 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 758 if (quoteLen > 0) { 759 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 760 } 761 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 762 } 763 764 /** 765 * Reads a possibly quoted string token. 766 * 767 * @param srcChars 768 * the character array being tokenized 769 * @param start 770 * the first character of field 771 * @param len 772 * the length of the character array being tokenized 773 * @param workArea 774 * a temporary work area 775 * @param tokenList 776 * the list of parsed tokens 777 * @param quoteStart 778 * the start position of the matched quote, 0 if no quoting 779 * @param quoteLen 780 * the length of the matched quote, 0 if no quoting 781 * @return The starting position of the next field (the character immediately after the delimiter, or if end of 782 * string found, then the length of string 783 */ 784 private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea, 785 final List<String> tokenList, final int quoteStart, final int quoteLen) { 786 // Loop until we've found the end of the quoted 787 // string or the end of the input 788 workArea.clear(); 789 int pos = start; 790 boolean quoting = quoteLen > 0; 791 int trimStart = 0; 792 793 while (pos < len) { 794 // quoting mode can occur several times throughout a string 795 // we must switch between quoting and non-quoting until we 796 // encounter a non-quoted delimiter, or end of string 797 if (quoting) { 798 // In quoting mode 799 800 // If we've found a quote character, see if it's 801 // followed by a second quote. If so, then we need 802 // to actually put the quote character into the token 803 // rather than end the token. 804 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 805 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 806 // matched pair of quotes, thus an escaped quote 807 workArea.append(srcChars, pos, quoteLen); 808 pos += quoteLen * 2; 809 trimStart = workArea.size(); 810 continue; 811 } 812 813 // end of quoting 814 quoting = false; 815 pos += quoteLen; 816 continue; 817 } 818 819 } else { 820 // Not in quoting mode 821 822 // check for delimiter, and thus end of token 823 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 824 if (delimLen > 0) { 825 // return condition when end of token found 826 addToken(tokenList, workArea.substring(0, trimStart)); 827 return pos + delimLen; 828 } 829 830 // check for quote, and thus back into quoting mode 831 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 832 quoting = true; 833 pos += quoteLen; 834 continue; 835 } 836 837 // check for ignored (outside quotes), and ignore 838 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 839 if (ignoredLen > 0) { 840 pos += ignoredLen; 841 continue; 842 } 843 844 // check for trimmed character 845 // don't yet know if its at the end, so copy to workArea 846 // use trimStart to keep track of trim at the end 847 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 848 if (trimmedLen > 0) { 849 workArea.append(srcChars, pos, trimmedLen); 850 pos += trimmedLen; 851 continue; 852 } 853 } 854 // copy regular character from inside quotes 855 workArea.append(srcChars[pos++]); 856 trimStart = workArea.size(); 857 } 858 859 // return condition when end of string found 860 addToken(tokenList, workArea.substring(0, trimStart)); 861 return -1; 862 } 863 864 /** 865 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation. 866 * 867 * @throws UnsupportedOperationException 868 * always 869 */ 870 @Override 871 public void remove() { 872 throw new UnsupportedOperationException("remove() is unsupported"); 873 } 874 875 /** 876 * Resets this tokenizer, forgetting all parsing and iteration already completed. 877 * <p> 878 * This method allows the same tokenizer to be reused for the same String. 879 * </p> 880 * 881 * @return this, to enable chaining 882 */ 883 public StringTokenizer reset() { 884 tokenPos = 0; 885 tokens = null; 886 return this; 887 } 888 889 /** 890 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the 891 * same settings on multiple input lines. 892 * 893 * @param input 894 * the new character array to tokenize, not cloned, null sets no text to parse 895 * @return this, to enable chaining 896 */ 897 public StringTokenizer reset(final char[] input) { 898 reset(); 899 this.chars = input != null ? input.clone() : null; 900 return this; 901 } 902 903 /** 904 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the 905 * same settings on multiple input lines. 906 * 907 * @param input 908 * the new string to tokenize, null sets no text to parse 909 * @return this, to enable chaining 910 */ 911 public StringTokenizer reset(final String input) { 912 reset(); 913 this.chars = input != null ? input.toCharArray() : null; 914 return this; 915 } 916 917 /** 918 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation. 919 * 920 * @param obj 921 * this parameter ignored. 922 * @throws UnsupportedOperationException 923 * always 924 */ 925 @Override 926 public void set(final String obj) { 927 throw new UnsupportedOperationException("set() is unsupported"); 928 } 929 930 /** 931 * Sets the field delimiter character. 932 * 933 * @param delim 934 * the delimiter character to use 935 * @return this, to enable chaining 936 */ 937 public StringTokenizer setDelimiterChar(final char delim) { 938 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim)); 939 } 940 941 /** 942 * Sets the field delimiter matcher. 943 * <p> 944 * The delimiter is used to separate one token from another. 945 * </p> 946 * 947 * @param delim 948 * the delimiter matcher to use 949 * @return this, to enable chaining 950 */ 951 public StringTokenizer setDelimiterMatcher(final StringMatcher delim) { 952 this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim; 953 return this; 954 } 955 956 /** 957 * Sets the field delimiter string. 958 * 959 * @param delim 960 * the delimiter string to use 961 * @return this, to enable chaining 962 */ 963 public StringTokenizer setDelimiterString(final String delim) { 964 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim)); 965 } 966 967 /** 968 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false. 969 * 970 * @param emptyAsNull 971 * whether empty tokens are returned as null 972 * @return this, to enable chaining 973 */ 974 public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 975 this.emptyAsNull = emptyAsNull; 976 return this; 977 } 978 979 /** 980 * Sets the character to ignore. 981 * <p> 982 * This character is ignored when parsing the String, unless it is within a quoted region. 983 * </p> 984 * 985 * @param ignored 986 * the ignored character to use 987 * @return this, to enable chaining 988 */ 989 public StringTokenizer setIgnoredChar(final char ignored) { 990 return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored)); 991 } 992 993 /** 994 * Sets the matcher for characters to ignore. 995 * <p> 996 * These characters are ignored when parsing the String, unless they are within a quoted region. 997 * </p> 998 * 999 * @param ignored 1000 * the ignored matcher to use, null ignored 1001 * @return this, to enable chaining 1002 */ 1003 public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) { 1004 if (ignored != null) { 1005 this.ignoredMatcher = ignored; 1006 } 1007 return this; 1008 } 1009 1010 /** 1011 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true. 1012 * 1013 * @param ignoreEmptyTokens 1014 * whether empty tokens are not returned 1015 * @return this, to enable chaining 1016 */ 1017 public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 1018 this.ignoreEmptyTokens = ignoreEmptyTokens; 1019 return this; 1020 } 1021 1022 /** 1023 * Sets the quote character to use. 1024 * <p> 1025 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 1026 * </p> 1027 * 1028 * @param quote 1029 * the quote character to use 1030 * @return this, to enable chaining 1031 */ 1032 public StringTokenizer setQuoteChar(final char quote) { 1033 return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote)); 1034 } 1035 1036 /** 1037 * Sets the quote matcher to use. 1038 * <p> 1039 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 1040 * </p> 1041 * 1042 * @param quote 1043 * the quote matcher to use, null ignored 1044 * @return this, to enable chaining 1045 */ 1046 public StringTokenizer setQuoteMatcher(final StringMatcher quote) { 1047 if (quote != null) { 1048 this.quoteMatcher = quote; 1049 } 1050 return this; 1051 } 1052 1053 /** 1054 * Sets the matcher for characters to trim. 1055 * <p> 1056 * These characters are trimmed off on each side of the delimiter until the token or quote is found. 1057 * 1058 * @param trimmer 1059 * the trimmer matcher to use, null ignored 1060 * @return this, to enable chaining 1061 */ 1062 public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) { 1063 if (trimmer != null) { 1064 this.trimmerMatcher = trimmer; 1065 } 1066 return this; 1067 } 1068 1069 /** 1070 * Gets the number of tokens found in the String. 1071 * 1072 * @return The number of matched tokens 1073 */ 1074 public int size() { 1075 checkTokenized(); 1076 return tokens.length; 1077 } 1078 1079 /** 1080 * Internal method to performs the tokenization. 1081 * <p> 1082 * Most users of this class do not need to call this method. This method will be called automatically by other 1083 * (public) methods when required. 1084 * </p> 1085 * <p> 1086 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass 1087 * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple 1088 * strings. It is also be possible to filter the results. 1089 * </p> 1090 * <p> 1091 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this 1092 * method, however a subclass may pass other values, or even an entirely different array. 1093 * </p> 1094 * 1095 * @param srcChars 1096 * the character array being tokenized, may be null 1097 * @param offset 1098 * the start position within the character array, must be valid 1099 * @param count 1100 * the number of characters to tokenize, must be valid 1101 * @return The modifiable list of String tokens, unmodifiable if null array or zero count 1102 */ 1103 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1104 if (srcChars == null || count == 0) { 1105 return Collections.emptyList(); 1106 } 1107 final TextStringBuilder buf = new TextStringBuilder(); 1108 final List<String> tokenList = new ArrayList<>(); 1109 int pos = offset; 1110 1111 // loop around the entire buffer 1112 while (pos >= 0 && pos < count) { 1113 // find next token 1114 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1115 1116 // handle case where end of string is a delimiter 1117 if (pos >= count) { 1118 addToken(tokenList, StringUtils.EMPTY); 1119 } 1120 } 1121 return tokenList; 1122 } 1123 1124 /** 1125 * Gets the String content that the tokenizer is parsing. 1126 * 1127 * @return The string content being parsed 1128 */ 1129 @Override 1130 public String toString() { 1131 if (tokens == null) { 1132 return "StringTokenizer[not tokenized yet]"; 1133 } 1134 return "StringTokenizer" + getTokenList(); 1135 } 1136 1137}