/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the {@code ListIterator} interface. By default, it is set up
 * like {@code StringTokenizer}.
 * </p>
 * <p>
 * The input String is split into a number of <em>tokens</em>.
 * Each token is separated from the next String by a <em>delimiter</em>.
 * One or more delimiter characters must be specified.
 * </p>
 * <p>
 * Each token may be surrounded by quotes.
 * The <em>quote</em> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * </p>
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <em>trimmer</em> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * </p>
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <em>ignored</em> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * </p>
 * <p>
 * Empty tokens may be removed or returned as null.
 * </p>
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>trimmer</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 1.0
 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;

    /** Tab separated values tokenizer internal variable. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;

    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse; a copy of the array is taken
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings
     * initializing it with the given input. The default for TSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the characters to parse; a copy of the array is taken
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings
     * initializing it with the given input. The default for TSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /** The text to work on. */
    private char[] chars;

    /** The parsed tokens, {@code null} until tokenization has been performed. */
    private String[] tokens;

    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer using the default delimiter matcher
     * ({@link StrMatcher#splitMatcher()}), but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StrTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer using the default delimiter matcher
     * ({@link StrMatcher#splitMatcher()}).
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken,
     *  so later changes to the caller's array do not affect this tokenizer
     */
    public StrTokenizer(final char[] input) {
        this.chars = input == null ? null : input.clone();
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the characters to parse, may be null; a copy of the array is taken
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer using the default delimiter matcher
     * ({@link StrMatcher#splitMatcher()}).
     *
     * @param input  the string which is to be parsed, may be null
     */
    public StrTokenizer(final String input) {
        this.chars = input == null ? null : input.toCharArray();
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * <p>
     * A null or empty token is dropped when {@link #isIgnoreEmptyTokens()} is true;
     * otherwise it is stored as {@code null} when {@link #isEmptyTokenAsNull()} is
     * true, and unchanged when it is false.
     * </p>
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.isEmpty()) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            // tokenize is still called when there is no text, as a subclass may do some work
            final int length = chars == null ? 0 : chars.length;
            final List<String> split = tokenize(chars, 0, length);
            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset,
     *  or {@code null} if cloning failed
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            // documented behaviour: a failed clone is reported as null rather than thrown
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list. The character buffer is copied so
     * that the clone does not share it with this instance.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed, or null if no text has been set
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * By default no quote is matched ({@link StrMatcher#noneMatcher()}); the
     * CSV and TSV factory instances use the double quote matcher.
     * </p>
     *
     * @return The quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);
        return list;
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     * <p>
     * The comparison is made against the characters of the opening quote
     * (at {@code quoteStart}), not by re-running the quote matcher. A zero
     * {@code quoteLen} always yields true, so callers guard on {@code quoteLen > 0}
     * where that matters.
     * </p>
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the exclusive end index of the region being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars,
                            final int pos,
                            final int len,
                            final int quoteStart,
                            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Gets the previous token from the String.
     * Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when there is no previous token.
     *
     * @return The previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea up to and including the last character that must be kept;
        // anything appended after it is trailing trim-able text
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

            }
            // copy a regular character (either inside quotes, or an unquoted
            // character that is not a delimiter, quote, ignored or trimmed)
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; a copy of the array is taken,
     *  null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = input == null ? null : input.clone();
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        this.chars = input == null ? null : input.toCharArray();
        return this;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     * <p>
     * Unlike the other matcher setters, which ignore a null argument, passing
     * null here installs {@link StrMatcher#noneMatcher()}.
     * </p>
     *
     * @param delim  the delimiter matcher to use, null means no delimiter is matched
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        this.delimMatcher = delim == null ? StrMatcher.noneMatcher() : delim;
        return this;
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * </p>
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * </p>
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StrTokenizer} will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     * </p>
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     *  ({@code offset + count} must not exceed the array length)
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        // The read helpers take an exclusive end index, not a length. Using count
        // directly is only right for a zero offset; with a non-zero offset it cut
        // the region short (or skipped it entirely when offset >= count).
        final int end = offset + count;
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < end) {
            // find next token
            pos = readNextToken(srcChars, pos, end, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= end) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Gets a String describing this tokenizer: the list of parsed tokens if
     * tokenization has already happened, otherwise a fixed "not tokenized yet"
     * marker. Calling this method never triggers tokenization.
     *
     * @return a description of the tokenizer state
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}