1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.text; 18 19 import java.util.ArrayList; 20 import java.util.Collections; 21 import java.util.List; 22 import java.util.ListIterator; 23 import java.util.NoSuchElementException; 24 25 import org.apache.commons.lang3.ArrayUtils; 26 import org.apache.commons.lang3.StringUtils; 27 28 /** 29 * Tokenizes a string based on delimiters (separators) 30 * and supporting quoting and ignored character concepts. 31 * <p> 32 * This class can split a String into many smaller strings. It aims 33 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 34 * however it offers much more control and flexibility including implementing 35 * the {@code ListIterator} interface. By default, it is set up 36 * like {@code StringTokenizer}. 37 * <p> 38 * The input String is split into a number of <em>tokens</em>. 39 * Each token is separated from the next String by a <em>delimiter</em>. 40 * One or more delimiter characters must be specified. 41 * <p> 42 * Each token may be surrounded by quotes. 43 * The <em>quote</em> matcher specifies the quote character(s). 44 * A quote may be escaped within a quoted section by duplicating itself. 45 * <p> 46 * Between each token and the delimiter are potentially characters that need trimming. 47 * The <em>trimmer</em> matcher specifies these characters. 48 * One usage might be to trim whitespace characters. 49 * <p> 50 * At any point outside the quotes there might potentially be invalid characters. 51 * The <em>ignored</em> matcher specifies these characters to be removed. 52 * One usage might be to remove new line characters. 53 * <p> 54 * Empty tokens may be removed or returned as null. 55 * <pre> 56 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 57 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 58 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 59 * </pre> 60 * 61 * <table> 62 * <caption>StrTokenizer properties and options</caption> 63 * <tr> 64 * <th>Property</th><th>Type</th><th>Default</th> 65 * </tr> 66 * <tr> 67 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 68 * </tr> 69 * <tr> 70 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 71 * </tr> 72 * <tr> 73 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 74 * </tr> 75 * <tr> 76 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 77 * </tr> 78 * <tr> 79 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 80 * </tr> 81 * </table> 82 * 83 * @since 1.0 84 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0. 85 */ 86 @Deprecated 87 public class StrTokenizer implements ListIterator<String>, Cloneable { 88 89 /** Comma separated values tokenizer internal variable. */ 90 // @formatter:off 91 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 92 .setDelimiterMatcher(StrMatcher.commaMatcher()) 93 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 94 .setIgnoredMatcher(StrMatcher.noneMatcher()) 95 .setTrimmerMatcher(StrMatcher.trimMatcher()) 96 .setEmptyTokenAsNull(false) 97 .setIgnoreEmptyTokens(false); 98 // @formatter:on 99 100 /** Tab separated values tokenizer internal variable. */ 101 // @formatter:off 102 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 103 .setDelimiterMatcher(StrMatcher.tabMatcher()) 104 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 105 .setIgnoredMatcher(StrMatcher.noneMatcher()) 106 .setTrimmerMatcher(StrMatcher.trimMatcher()) 107 .setEmptyTokenAsNull(false) 108 .setIgnoreEmptyTokens(false); 109 // @formatter:on 110 111 /** 112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 113 * 114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 115 */ 116 private static StrTokenizer getCSVClone() { 117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 118 } 119 120 /** 121 * Gets a new tokenizer instance which parses Comma Separated Value strings 122 * initializing it with the given input. The default for CSV processing 123 * will be trim whitespace from both ends (which can be overridden with 124 * the setTrimmer method). 125 * <p> 126 * You must call a "reset" method to set the string which you want to parse. 127 * </p> 128 * @return a new tokenizer instance which parses Comma Separated Value strings 129 */ 130 public static StrTokenizer getCSVInstance() { 131 return getCSVClone(); 132 } 133 134 /** 135 * Gets a new tokenizer instance which parses Comma Separated Value strings 136 * initializing it with the given input. The default for CSV processing 137 * will be trim whitespace from both ends (which can be overridden with 138 * the setTrimmer method). 139 * 140 * @param input the text to parse 141 * @return a new tokenizer instance which parses Comma Separated Value strings 142 */ 143 public static StrTokenizer getCSVInstance(final char[] input) { 144 final StrTokenizer tok = getCSVClone(); 145 tok.reset(input); 146 return tok; 147 } 148 149 /** 150 * Gets a new tokenizer instance which parses Comma Separated Value strings 151 * initializing it with the given input. The default for CSV processing 152 * will be trim whitespace from both ends (which can be overridden with 153 * the setTrimmer method). 154 * 155 * @param input the text to parse 156 * @return a new tokenizer instance which parses Comma Separated Value strings 157 */ 158 public static StrTokenizer getCSVInstance(final String input) { 159 final StrTokenizer tok = getCSVClone(); 160 tok.reset(input); 161 return tok; 162 } 163 /** 164 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 165 * 166 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 167 */ 168 private static StrTokenizer getTSVClone() { 169 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 170 } 171 172 /** 173 * Gets a new tokenizer instance which parses Tab Separated Value strings. 174 * The default for CSV processing will be trim whitespace from both ends 175 * (which can be overridden with the setTrimmer method). 176 * <p> 177 * You must call a "reset" method to set the string which you want to parse. 178 * </p> 179 * @return a new tokenizer instance which parses Tab Separated Value strings. 180 */ 181 public static StrTokenizer getTSVInstance() { 182 return getTSVClone(); 183 } 184 185 /** 186 * Gets a new tokenizer instance which parses Tab Separated Value strings. 187 * The default for CSV processing will be trim whitespace from both ends 188 * (which can be overridden with the setTrimmer method). 189 * @param input the string to parse 190 * @return a new tokenizer instance which parses Tab Separated Value strings. 191 */ 192 public static StrTokenizer getTSVInstance(final char[] input) { 193 final StrTokenizer tok = getTSVClone(); 194 tok.reset(input); 195 return tok; 196 } 197 198 /** 199 * Gets a new tokenizer instance which parses Tab Separated Value strings. 200 * The default for CSV processing will be trim whitespace from both ends 201 * (which can be overridden with the setTrimmer method). 202 * @param input the string to parse 203 * @return a new tokenizer instance which parses Tab Separated Value strings. 204 */ 205 public static StrTokenizer getTSVInstance(final String input) { 206 final StrTokenizer tok = getTSVClone(); 207 tok.reset(input); 208 return tok; 209 } 210 211 /** The text to work on. */ 212 private char[] chars; 213 214 /** The parsed tokens. */ 215 private String[] tokens; 216 217 /** The current iteration position. */ 218 private int tokenPos; 219 220 /** The delimiter matcher. */ 221 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 222 223 /** The quote matcher. */ 224 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 225 226 /** The ignored matcher. */ 227 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 228 229 /** The trimmer matcher. */ 230 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 231 232 /** Whether to return empty tokens as null. */ 233 private boolean emptyAsNull; 234 235 /** Whether to ignore empty tokens. */ 236 private boolean ignoreEmptyTokens = true; 237 238 /** 239 * Constructs a tokenizer splitting on space, tab, newline and form feed 240 * as per StringTokenizer, but with no text to tokenize. 241 * <p> 242 * This constructor is normally used with {@link #reset(String)}. 243 * </p> 244 */ 245 public StrTokenizer() { 246 this.chars = null; 247 } 248 249 /** 250 * Constructs a tokenizer splitting on space, tab, newline and form feed 251 * as per StringTokenizer. 252 * 253 * @param input the string which is to be parsed, not cloned 254 */ 255 public StrTokenizer(final char[] input) { 256 if (input == null) { 257 this.chars = null; 258 } else { 259 this.chars = input.clone(); 260 } 261 } 262 263 /** 264 * Constructs a tokenizer splitting on the specified character. 265 * 266 * @param input the string which is to be parsed, not cloned 267 * @param delim the field delimiter character 268 */ 269 public StrTokenizer(final char[] input, final char delim) { 270 this(input); 271 setDelimiterChar(delim); 272 } 273 274 /** 275 * Constructs a tokenizer splitting on the specified delimiter character 276 * and handling quotes using the specified quote character. 277 * 278 * @param input the string which is to be parsed, not cloned 279 * @param delim the field delimiter character 280 * @param quote the field quoted string character 281 */ 282 public StrTokenizer(final char[] input, final char delim, final char quote) { 283 this(input, delim); 284 setQuoteChar(quote); 285 } 286 287 /** 288 * Constructs a tokenizer splitting on the specified string. 289 * 290 * @param input the string which is to be parsed, not cloned 291 * @param delim the field delimiter string 292 */ 293 public StrTokenizer(final char[] input, final String delim) { 294 this(input); 295 setDelimiterString(delim); 296 } 297 298 /** 299 * Constructs a tokenizer splitting using the specified delimiter matcher. 300 * 301 * @param input the string which is to be parsed, not cloned 302 * @param delim the field delimiter matcher 303 */ 304 public StrTokenizer(final char[] input, final StrMatcher delim) { 305 this(input); 306 setDelimiterMatcher(delim); 307 } 308 309 /** 310 * Constructs a tokenizer splitting using the specified delimiter matcher 311 * and handling quotes using the specified quote matcher. 312 * 313 * @param input the string which is to be parsed, not cloned 314 * @param delim the field delimiter character 315 * @param quote the field quoted string character 316 */ 317 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 318 this(input, delim); 319 setQuoteMatcher(quote); 320 } 321 322 /** 323 * Constructs a tokenizer splitting on space, tab, newline and form feed 324 * as per StringTokenizer. 325 * 326 * @param input the string which is to be parsed 327 */ 328 public StrTokenizer(final String input) { 329 if (input != null) { 330 chars = input.toCharArray(); 331 } else { 332 chars = null; 333 } 334 } 335 336 /** 337 * Constructs a tokenizer splitting on the specified delimiter character. 338 * 339 * @param input the string which is to be parsed 340 * @param delim the field delimiter character 341 */ 342 public StrTokenizer(final String input, final char delim) { 343 this(input); 344 setDelimiterChar(delim); 345 } 346 347 /** 348 * Constructs a tokenizer splitting on the specified delimiter character 349 * and handling quotes using the specified quote character. 350 * 351 * @param input the string which is to be parsed 352 * @param delim the field delimiter character 353 * @param quote the field quoted string character 354 */ 355 public StrTokenizer(final String input, final char delim, final char quote) { 356 this(input, delim); 357 setQuoteChar(quote); 358 } 359 360 /** 361 * Constructs a tokenizer splitting on the specified delimiter string. 362 * 363 * @param input the string which is to be parsed 364 * @param delim the field delimiter string 365 */ 366 public StrTokenizer(final String input, final String delim) { 367 this(input); 368 setDelimiterString(delim); 369 } 370 371 /** 372 * Constructs a tokenizer splitting using the specified delimiter matcher. 373 * 374 * @param input the string which is to be parsed 375 * @param delim the field delimiter matcher 376 */ 377 public StrTokenizer(final String input, final StrMatcher delim) { 378 this(input); 379 setDelimiterMatcher(delim); 380 } 381 382 /** 383 * Constructs a tokenizer splitting using the specified delimiter matcher 384 * and handling quotes using the specified quote matcher. 385 * 386 * @param input the string which is to be parsed 387 * @param delim the field delimiter matcher 388 * @param quote the field quoted string matcher 389 */ 390 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 391 this(input, delim); 392 setQuoteMatcher(quote); 393 } 394 395 /** 396 * Unsupported ListIterator operation. 397 * @param obj this parameter ignored. 398 * @throws UnsupportedOperationException always 399 */ 400 @Override 401 public void add(final String obj) { 402 throw new UnsupportedOperationException("add() is unsupported"); 403 } 404 405 /** 406 * Adds a token to a list, paying attention to the parameters we've set. 407 * 408 * @param list the list to add to 409 * @param tok the token to add 410 */ 411 private void addToken(final List<String> list, String tok) { 412 if (tok == null || tok.isEmpty()) { 413 if (isIgnoreEmptyTokens()) { 414 return; 415 } 416 if (isEmptyTokenAsNull()) { 417 tok = null; 418 } 419 } 420 list.add(tok); 421 } 422 423 /** 424 * Checks if tokenization has been done, and if not then do it. 425 */ 426 private void checkTokenized() { 427 if (tokens == null) { 428 if (chars == null) { 429 // still call tokenize as subclass may do some work 430 final List<String> split = tokenize(null, 0, 0); 431 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 432 } else { 433 final List<String> split = tokenize(chars, 0, chars.length); 434 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 435 } 436 } 437 } 438 439 /** 440 * Creates a new instance of this Tokenizer. The new instance is reset so 441 * that it will be at the start of the token list. 442 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 443 * 444 * @return a new instance of this Tokenizer which has been reset. 445 */ 446 @Override 447 public Object clone() { 448 try { 449 return cloneReset(); 450 } catch (final CloneNotSupportedException ex) { 451 return null; 452 } 453 } 454 455 /** 456 * Creates a new instance of this Tokenizer. The new instance is reset so that 457 * it will be at the start of the token list. 458 * 459 * @return a new instance of this Tokenizer which has been reset. 460 * @throws CloneNotSupportedException if there is a problem cloning 461 */ 462 Object cloneReset() throws CloneNotSupportedException { 463 // this method exists to enable 100% test coverage 464 final StrTokenizer cloned = (StrTokenizer) super.clone(); 465 if (cloned.chars != null) { 466 cloned.chars = cloned.chars.clone(); 467 } 468 cloned.reset(); 469 return cloned; 470 } 471 472 /** 473 * Gets the String content that the tokenizer is parsing. 474 * 475 * @return The string content being parsed 476 */ 477 public String getContent() { 478 if (chars == null) { 479 return null; 480 } 481 return new String(chars); 482 } 483 484 /** 485 * Gets the field delimiter matcher. 486 * 487 * @return The delimiter matcher in use 488 */ 489 public StrMatcher getDelimiterMatcher() { 490 return this.delimMatcher; 491 } 492 493 /** 494 * Gets the ignored character matcher. 495 * <p> 496 * These characters are ignored when parsing the String, unless they are 497 * within a quoted region. 498 * The default value is not to ignore anything. 499 * </p> 500 * 501 * @return The ignored matcher in use 502 */ 503 public StrMatcher getIgnoredMatcher() { 504 return ignoredMatcher; 505 } 506 507 /** 508 * Gets the quote matcher currently in use. 509 * <p> 510 * The quote character is used to wrap data between the tokens. 511 * This enables delimiters to be entered as data. 512 * The default value is '"' (double quote). 513 * </p> 514 * 515 * @return The quote matcher in use 516 */ 517 public StrMatcher getQuoteMatcher() { 518 return quoteMatcher; 519 } 520 521 /** 522 * Gets a copy of the full token list as an independent modifiable array. 523 * 524 * @return The tokens as a String array 525 */ 526 public String[] getTokenArray() { 527 checkTokenized(); 528 return tokens.clone(); 529 } 530 531 /** 532 * Gets a copy of the full token list as an independent modifiable list. 533 * 534 * @return The tokens as a String array 535 */ 536 public List<String> getTokenList() { 537 checkTokenized(); 538 final List<String> list = new ArrayList<>(tokens.length); 539 Collections.addAll(list, tokens); 540 541 return list; 542 } 543 544 /** 545 * Gets the trimmer character matcher. 546 * <p> 547 * These characters are trimmed off on each side of the delimiter 548 * until the token or quote is found. 549 * The default value is not to trim anything. 550 * </p> 551 * 552 * @return The trimmer matcher in use 553 */ 554 public StrMatcher getTrimmerMatcher() { 555 return trimmerMatcher; 556 } 557 558 /** 559 * Checks whether there are any more tokens. 560 * 561 * @return true if there are more tokens 562 */ 563 @Override 564 public boolean hasNext() { 565 checkTokenized(); 566 return tokenPos < tokens.length; 567 } 568 569 /** 570 * Checks whether there are any previous tokens that can be iterated to. 571 * 572 * @return true if there are previous tokens 573 */ 574 @Override 575 public boolean hasPrevious() { 576 checkTokenized(); 577 return tokenPos > 0; 578 } 579 580 /** 581 * Gets whether the tokenizer currently returns empty tokens as null. 582 * The default for this property is false. 583 * 584 * @return true if empty tokens are returned as null 585 */ 586 public boolean isEmptyTokenAsNull() { 587 return this.emptyAsNull; 588 } 589 590 /** 591 * Gets whether the tokenizer currently ignores empty tokens. 592 * The default for this property is true. 593 * 594 * @return true if empty tokens are not returned 595 */ 596 public boolean isIgnoreEmptyTokens() { 597 return ignoreEmptyTokens; 598 } 599 600 /** 601 * Checks if the characters at the index specified match the quote 602 * already matched in readNextToken(). 603 * 604 * @param srcChars the character array being tokenized 605 * @param pos the position to check for a quote 606 * @param len the length of the character array being tokenized 607 * @param quoteStart the start position of the matched quote, 0 if no quoting 608 * @param quoteLen the length of the matched quote, 0 if no quoting 609 * @return true if a quote is matched 610 */ 611 private boolean isQuote(final char[] srcChars, 612 final int pos, 613 final int len, 614 final int quoteStart, 615 final int quoteLen) { 616 for (int i = 0; i < quoteLen; i++) { 617 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 618 return false; 619 } 620 } 621 return true; 622 } 623 624 /** 625 * Gets the next token. 626 * 627 * @return The next String token 628 * @throws NoSuchElementException if there are no more elements 629 */ 630 @Override 631 public String next() { 632 if (hasNext()) { 633 return tokens[tokenPos++]; 634 } 635 throw new NoSuchElementException(); 636 } 637 638 /** 639 * Gets the index of the next token to return. 640 * 641 * @return The next token index 642 */ 643 @Override 644 public int nextIndex() { 645 return tokenPos; 646 } 647 648 /** 649 * Gets the next token from the String. 650 * Equivalent to {@link #next()} except it returns null rather than 651 * throwing {@link NoSuchElementException} when no tokens remain. 652 * 653 * @return The next sequential token, or null when no more tokens are found 654 */ 655 public String nextToken() { 656 if (hasNext()) { 657 return tokens[tokenPos++]; 658 } 659 return null; 660 } 661 662 /** 663 * Gets the token previous to the last returned token. 664 * 665 * @return The previous token 666 */ 667 @Override 668 public String previous() { 669 if (hasPrevious()) { 670 return tokens[--tokenPos]; 671 } 672 throw new NoSuchElementException(); 673 } 674 675 /** 676 * Gets the index of the previous token. 677 * 678 * @return The previous token index 679 */ 680 @Override 681 public int previousIndex() { 682 return tokenPos - 1; 683 } 684 685 /** 686 * Gets the previous token from the String. 687 * 688 * @return The previous sequential token, or null when no more tokens are found 689 */ 690 public String previousToken() { 691 if (hasPrevious()) { 692 return tokens[--tokenPos]; 693 } 694 return null; 695 } 696 697 /** 698 * Reads character by character through the String to get the next token. 699 * 700 * @param srcChars the character array being tokenized 701 * @param start the first character of field 702 * @param len the length of the character array being tokenized 703 * @param workArea a temporary work area 704 * @param tokenList the list of parsed tokens 705 * @return The starting position of the next field (the character 706 * immediately after the delimiter), or -1 if end of string found 707 */ 708 private int readNextToken(final char[] srcChars, 709 int start, 710 final int len, 711 final StrBuilder workArea, 712 final List<String> tokenList) { 713 // skip all leading whitespace, unless it is the 714 // field delimiter or the quote character 715 while (start < len) { 716 final int removeLen = Math.max( 717 getIgnoredMatcher().isMatch(srcChars, start, start, len), 718 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 719 if (removeLen == 0 720 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 721 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 722 break; 723 } 724 start += removeLen; 725 } 726 727 // handle reaching end 728 if (start >= len) { 729 addToken(tokenList, StringUtils.EMPTY); 730 return -1; 731 } 732 733 // handle empty token 734 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 735 if (delimLen > 0) { 736 addToken(tokenList, StringUtils.EMPTY); 737 return start + delimLen; 738 } 739 740 // handle found token 741 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 742 if (quoteLen > 0) { 743 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 744 } 745 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 746 } 747 748 /** 749 * Reads a possibly quoted string token. 750 * 751 * @param srcChars the character array being tokenized 752 * @param start the first character of field 753 * @param len the length of the character array being tokenized 754 * @param workArea a temporary work area 755 * @param tokenList the list of parsed tokens 756 * @param quoteStart the start position of the matched quote, 0 if no quoting 757 * @param quoteLen the length of the matched quote, 0 if no quoting 758 * @return The starting position of the next field (the character 759 * immediately after the delimiter, or if end of string found, 760 * then the length of string 761 */ 762 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 763 final List<String> tokenList, final int quoteStart, final int quoteLen) { 764 // Loop until we've found the end of the quoted 765 // string or the end of the input 766 workArea.clear(); 767 int pos = start; 768 boolean quoting = quoteLen > 0; 769 int trimStart = 0; 770 771 while (pos < len) { 772 // quoting mode can occur several times throughout a string 773 // we must switch between quoting and non-quoting until we 774 // encounter a non-quoted delimiter, or end of string 775 if (quoting) { 776 // In quoting mode 777 778 // If we've found a quote character, see if it's 779 // followed by a second quote. If so, then we need 780 // to actually put the quote character into the token 781 // rather than end the token. 782 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 783 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 784 // matched pair of quotes, thus an escaped quote 785 workArea.append(srcChars, pos, quoteLen); 786 pos += quoteLen * 2; 787 trimStart = workArea.size(); 788 continue; 789 } 790 791 // end of quoting 792 quoting = false; 793 pos += quoteLen; 794 continue; 795 } 796 797 } else { 798 // Not in quoting mode 799 800 // check for delimiter, and thus end of token 801 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 802 if (delimLen > 0) { 803 // return condition when end of token found 804 addToken(tokenList, workArea.substring(0, trimStart)); 805 return pos + delimLen; 806 } 807 808 // check for quote, and thus back into quoting mode 809 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 810 quoting = true; 811 pos += quoteLen; 812 continue; 813 } 814 815 // check for ignored (outside quotes), and ignore 816 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 817 if (ignoredLen > 0) { 818 pos += ignoredLen; 819 continue; 820 } 821 822 // check for trimmed character 823 // don't yet know if its at the end, so copy to workArea 824 // use trimStart to keep track of trim at the end 825 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 826 if (trimmedLen > 0) { 827 workArea.append(srcChars, pos, trimmedLen); 828 pos += trimmedLen; 829 continue; 830 } 831 832 } 833 // copy regular character from inside quotes 834 workArea.append(srcChars[pos++]); 835 trimStart = workArea.size(); 836 } 837 838 // return condition when end of string found 839 addToken(tokenList, workArea.substring(0, trimStart)); 840 return -1; 841 } 842 843 /** 844 * Unsupported ListIterator operation. 845 * 846 * @throws UnsupportedOperationException always 847 */ 848 @Override 849 public void remove() { 850 throw new UnsupportedOperationException("remove() is unsupported"); 851 } 852 853 /** 854 * Resets this tokenizer, forgetting all parsing and iteration already completed. 855 * <p> 856 * This method allows the same tokenizer to be reused for the same String. 857 * 858 * @return this, to enable chaining 859 */ 860 public StrTokenizer reset() { 861 tokenPos = 0; 862 tokens = null; 863 return this; 864 } 865 866 /** 867 * Reset this tokenizer, giving it a new input string to parse. 868 * In this manner you can re-use a tokenizer with the same settings 869 * on multiple input lines. 870 * 871 * @param input the new character array to tokenize, not cloned, null sets no text to parse 872 * @return this, to enable chaining 873 */ 874 public StrTokenizer reset(final char[] input) { 875 reset(); 876 if (input != null) { 877 this.chars = input.clone(); 878 } else { 879 this.chars = null; 880 } 881 return this; 882 } 883 884 /** 885 * Reset this tokenizer, giving it a new input string to parse. 886 * In this manner you can re-use a tokenizer with the same settings 887 * on multiple input lines. 888 * 889 * @param input the new string to tokenize, null sets no text to parse 890 * @return this, to enable chaining 891 */ 892 public StrTokenizer reset(final String input) { 893 reset(); 894 if (input != null) { 895 this.chars = input.toCharArray(); 896 } else { 897 this.chars = null; 898 } 899 return this; 900 } 901 902 /** 903 * Unsupported ListIterator operation. 904 * @param obj this parameter ignored. 905 * @throws UnsupportedOperationException always 906 */ 907 @Override 908 public void set(final String obj) { 909 throw new UnsupportedOperationException("set() is unsupported"); 910 } 911 912 /** 913 * Sets the field delimiter character. 914 * 915 * @param delim the delimiter character to use 916 * @return this, to enable chaining 917 */ 918 public StrTokenizer setDelimiterChar(final char delim) { 919 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 920 } 921 922 /** 923 * Sets the field delimiter matcher. 924 * <p> 925 * The delimiter is used to separate one token from another. 926 * </p> 927 * 928 * @param delim the delimiter matcher to use 929 * @return this, to enable chaining 930 */ 931 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 932 if (delim == null) { 933 this.delimMatcher = StrMatcher.noneMatcher(); 934 } else { 935 this.delimMatcher = delim; 936 } 937 return this; 938 } 939 940 /** 941 * Sets the field delimiter string. 942 * 943 * @param delim the delimiter string to use 944 * @return this, to enable chaining 945 */ 946 public StrTokenizer setDelimiterString(final String delim) { 947 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 948 } 949 950 /** 951 * Sets whether the tokenizer should return empty tokens as null. 952 * The default for this property is false. 953 * 954 * @param emptyAsNull whether empty tokens are returned as null 955 * @return this, to enable chaining 956 */ 957 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 958 this.emptyAsNull = emptyAsNull; 959 return this; 960 } 961 962 /** 963 * Sets the character to ignore. 964 * <p> 965 * This character is ignored when parsing the String, unless it is 966 * within a quoted region. 967 * </p> 968 * 969 * @param ignored the ignored character to use 970 * @return this, to enable chaining 971 */ 972 public StrTokenizer setIgnoredChar(final char ignored) { 973 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 974 } 975 976 /** 977 * Sets the matcher for characters to ignore. 978 * <p> 979 * These characters are ignored when parsing the String, unless they are 980 * within a quoted region. 981 * </p> 982 * 983 * @param ignored the ignored matcher to use, null ignored 984 * @return this, to enable chaining 985 */ 986 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 987 if (ignored != null) { 988 this.ignoredMatcher = ignored; 989 } 990 return this; 991 } 992 993 /** 994 * Sets whether the tokenizer should ignore and not return empty tokens. 995 * The default for this property is true. 996 * 997 * @param ignoreEmptyTokens whether empty tokens are not returned 998 * @return this, to enable chaining 999 */ 1000 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 1001 this.ignoreEmptyTokens = ignoreEmptyTokens; 1002 return this; 1003 } 1004 1005 /** 1006 * Sets the quote character to use. 1007 * <p> 1008 * The quote character is used to wrap data between the tokens. 1009 * This enables delimiters to be entered as data. 1010 * </p> 1011 * 1012 * @param quote the quote character to use 1013 * @return this, to enable chaining 1014 */ 1015 public StrTokenizer setQuoteChar(final char quote) { 1016 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 1017 } 1018 1019 /** 1020 * Sets the quote matcher to use. 1021 * <p> 1022 * The quote character is used to wrap data between the tokens. 1023 * This enables delimiters to be entered as data. 1024 * </p> 1025 * 1026 * @param quote the quote matcher to use, null ignored 1027 * @return this, to enable chaining 1028 */ 1029 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1030 if (quote != null) { 1031 this.quoteMatcher = quote; 1032 } 1033 return this; 1034 } 1035 1036 /** 1037 * Sets the matcher for characters to trim. 1038 * <p> 1039 * These characters are trimmed off on each side of the delimiter 1040 * until the token or quote is found. 1041 * </p> 1042 * 1043 * @param trimmer the trimmer matcher to use, null ignored 1044 * @return this, to enable chaining 1045 */ 1046 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1047 if (trimmer != null) { 1048 this.trimmerMatcher = trimmer; 1049 } 1050 return this; 1051 } 1052 1053 /** 1054 * Gets the number of tokens found in the String. 1055 * 1056 * @return The number of matched tokens 1057 */ 1058 public int size() { 1059 checkTokenized(); 1060 return tokens.length; 1061 } 1062 1063 /** 1064 * Internal method to performs the tokenization. 1065 * <p> 1066 * Most users of this class do not need to call this method. This method 1067 * will be called automatically by other (public) methods when required. 1068 * </p> 1069 * <p> 1070 * This method exists to allow subclasses to add code before or after the 1071 * tokenization. For example, a subclass could alter the character array, 1072 * offset or count to be parsed, or call the tokenizer multiple times on 1073 * multiple strings. It is also be possible to filter the results. 1074 * </p> 1075 * <p> 1076 * {@code StrTokenizer} will always pass a zero offset and a count 1077 * equal to the length of the array to this method, however a subclass 1078 * may pass other values, or even an entirely different array. 1079 * </p> 1080 * 1081 * @param srcChars the character array being tokenized, may be null 1082 * @param offset the start position within the character array, must be valid 1083 * @param count the number of characters to tokenize, must be valid 1084 * @return The modifiable list of String tokens, unmodifiable if null array or zero count 1085 */ 1086 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1087 if (srcChars == null || count == 0) { 1088 return Collections.emptyList(); 1089 } 1090 final StrBuilder buf = new StrBuilder(); 1091 final List<String> tokenList = new ArrayList<>(); 1092 int pos = offset; 1093 1094 // loop around the entire buffer 1095 while (pos >= 0 && pos < count) { 1096 // find next token 1097 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1098 1099 // handle case where end of string is a delimiter 1100 if (pos >= count) { 1101 addToken(tokenList, StringUtils.EMPTY); 1102 } 1103 } 1104 return tokenList; 1105 } 1106 1107 /** 1108 * Gets the String content that the tokenizer is parsing. 1109 * 1110 * @return The string content being parsed 1111 */ 1112 @Override 1113 public String toString() { 1114 if (tokens == null) { 1115 return "StrTokenizer[not tokenized yet]"; 1116 } 1117 return "StrTokenizer" + getTokenList(); 1118 } 1119 1120 }