/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
 * <em>delimiter</em>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 *
 * <pre>
 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;

    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;

    static {
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing will
     * be trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        return getCSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        return getCSVClone().reset(input);
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        return getTSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        return getTSVClone().reset(input);
    }

    /** The text to work on. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed. */
    private String[] tokens;

    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
     * tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StringTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     */
    public StringTokenizer(final char[] input) {
        this.chars = input != null ? input.clone() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the characters to parse; the array is cloned, null means no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed
     */
    public StringTokenizer(final String input) {
        this.chars = input != null ? input.toCharArray() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * <p>
     * A null or empty token is dropped when empty tokens are ignored; otherwise it is stored as null when
     * {@link #isEmptyTokenAsNull()} is set, or stored as given.
     * </p>
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.isEmpty()) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            final List<String> split;
            if (chars == null) {
                // still call tokenize as subclass may do some work
                split = tokenize(null, 0, 0);
            } else {
                split = tokenize(chars, 0, chars.length);
            }
            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            // documented contract of this method: report the failure as null
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. The clone works on its own copy of the character array.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed, null if no text has been set
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
     * default value is the matcher returned by {@code StringMatcherFactory.INSTANCE.noneMatcher()}; the CSV and TSV
     * factory methods configure a double quote matcher instead.
     * </p>
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        return new ArrayList<>(Arrays.asList(tokens));
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Tests whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Tests whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Tests if the characters at the index specified match the quote already matched in readNextToken().
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the exclusive end index of the region being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            // running past the end of the region counts as "no match"
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     * @throws NoSuchElementException
     *             if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Gets the previous token from the String. Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when there is no previous token.
     *
     * @return The previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the exclusive end index of the region being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the exclusive end index of the region being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea up to the last character that must be kept;
        // anything appended after it is trailing trimmable text
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy regular character (quoted, or unquoted and not otherwise matched)
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        this.chars = input != null ? input.clone() : null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        this.chars = input != null ? input.toCharArray() : null;
        return this;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim
     *            the delimiter matcher to use, null is replaced by
     *            {@code StringMatcherFactory.INSTANCE.noneMatcher()}
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
        return this;
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     * </p>
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     * </p>
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * </p>
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array. The region parsed is
     * {@code [offset, offset + count)}.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        // The helpers work with an exclusive end index, so convert the (offset, count)
        // pair once here; using count itself as the end is only right when offset is 0.
        final int end = offset + count;
        int pos = offset;

        // loop around the entire region
        while (pos >= 0 && pos < end) {
            // find next token
            pos = readNextToken(srcChars, pos, end, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= end) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Gets a String describing this tokenizer: a fixed marker if tokenization has not happened yet, otherwise the
     * text {@code "StringTokenizer"} followed by the token list.
     *
     * @return a description of the tokenizer state
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }

}