/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
 * </p>
 * <p>
 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
 * <em>delimiter</em>. One or more delimiter characters must be specified.
 * </p>
 * <p>
 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * </p>
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * </p>
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * </p>
 * <p>
 * Empty tokens may be removed or returned as null.
 * </p>
 *
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    // @formatter:off
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on

    /** Tab separated values tokenizer internal variable. */
    // @formatter:off
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE = new StringTokenizer()
            .setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher())
            .setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher())
            .setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher())
            .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing will
     * be trim whitespace from both ends (which can be overridden with {@link #setTrimmerMatcher(StringMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings.
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StringMatcher)}).
     *
     * @param input the text to parse; the array is cloned, null sets no text to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings.
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        return getCSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StringMatcher)}).
     *
     * @param input the text to parse, null sets no text to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings.
     */
    public static StringTokenizer getCSVInstance(final String input) {
        return getCSVClone().reset(input);
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * trim whitespace from both ends (which can be overridden with {@link #setTrimmerMatcher(StringMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StringMatcher)}).
     *
     * @param input the text to parse; the array is cloned, null sets no text to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        return getTSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StringMatcher)}).
     *
     * @param input the text to parse, null sets no text to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        return getTSVClone().reset(input);
    }

    /** The text to work on. */
    private char[] chars;

    /** The parsed tokens, {@code null} until tokenization has been performed. */
    private String[] tokens;

    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer,
     * but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StringTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     */
    public StringTokenizer(final char[] input) {
        // defensive copy: later changes to the caller's array must not affect tokenization
        this.chars = input != null ? input.clone() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     * @param delim the field delimiter character.
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     * @param delim the field delimiter character.
     * @param quote the field quoted string character.
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     * @param delim the field delimiter string.
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     * @param delim the field delimiter matcher.
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input the characters which are to be parsed; the array is cloned, may be null.
     * @param delim the field delimiter matcher.
     * @param quote the field quoted string matcher.
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
     *
     * @param input the string which is to be parsed, may be null.
     */
    public StringTokenizer(final String input) {
        this.chars = input != null ? input.toCharArray() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed, may be null.
     * @param delim the field delimiter character.
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input the string which is to be parsed, may be null.
     * @param delim the field delimiter character.
     * @param quote the field quoted string character.
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed, may be null.
     * @param delim the field delimiter string.
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, may be null.
     * @param delim the field delimiter matcher.
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input the string which is to be parsed, may be null.
     * @param delim the field delimiter matcher.
     * @param quote the field quoted string matcher.
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * <p>
     * A null or empty token is dropped when {@link #isIgnoreEmptyTokens()} is true; otherwise it is added as
     * {@code null} when {@link #isEmptyTokenAsNull()} is true, or unchanged when it is false.
     * </p>
     *
     * @param list the list to add to.
     * @param tok the token to add.
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.isEmpty()) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            final List<String> split;
            if (chars == null) {
                // still call tokenize as subclass may do some work.
                split = tokenize(null, 0, 0);
            } else {
                split = tokenize(chars, 0, chars.length);
            }
            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the
     * token list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset, or {@code null} if cloning failed.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the
     * token list. The clone receives its own copy of the character array.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning.
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed, or {@code null} if no text has been set.
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use.
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use.
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. A
     * newly constructed tokenizer uses {@code StringMatcherFactory.INSTANCE.noneMatcher()}; the instances returned by
     * the {@code getCSVInstance} and {@code getTSVInstance} factory methods use a double quote matcher.
     * </p>
     *
     * @return The quote matcher in use.
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array.
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list.
     */
    public List<String> getTokenList() {
        checkTokenized();
        return new ArrayList<>(Arrays.asList(tokens));
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use.
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Tests whether there are any more tokens.
     *
     * @return true if there are more tokens.
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Tests whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens.
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null.
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned.
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Tests if the characters at the index specified match the quote already matched in readNextToken().
     * <p>
     * The comparison is made character by character against the quote text found at {@code quoteStart}; a position
     * at or beyond {@code len} never matches. When {@code quoteLen} is zero the loop does not run and the method
     * returns true, so callers guard the call with {@code quoteLen > 0} when not already quoting.
     * </p>
     *
     * @param srcChars the character array being tokenized.
     * @param pos the position to check for a quote.
     * @param len the length of the character array being tokenized.
     * @param quoteStart the start position of the matched quote, 0 if no quoting.
     * @param quoteLen the length of the matched quote, 0 if no quoting.
     * @return true if a quote is matched.
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token.
     * @throws NoSuchElementException if there are no more elements.
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index.
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found.
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token.
     * @throws NoSuchElementException if there is no previous element.
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index.
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Gets the previous token from the String. Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when there is no previous token.
     *
     * @return The previous sequential token, or null when no more tokens are found.
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized.
     * @param start the first character of field.
     * @param len the length of the character array being tokenized.
     * @param workArea a temporary work area.
     * @param tokenList the list of parsed tokens.
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found.
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading ignored or trimmed characters, unless they are
        // also the field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized.
     * @param start the first character of field.
     * @param len the length of the character array being tokenized.
     * @param workArea a temporary work area.
     * @param tokenList the list of parsed tokens.
     * @param quoteStart the start position of the matched quote, 0 if no quoting.
     * @param quoteLen the length of the matched quote, 0 if no quoting.
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found.
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of workArea up to the last character that must be kept;
        // trimmable characters appended after it are cut off when the token ends
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy regular character (inside quotes, or an ordinary unquoted character)
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException Always thrown.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return {@code this} instance.
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with
     * the same settings on multiple input lines.
     *
     * @param input the new character array to tokenize; the array is cloned, null sets no text to parse.
     * @return {@code this} instance.
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        this.chars = input != null ? input.clone() : null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with
     * the same settings on multiple input lines.
     *
     * @param input the new string to tokenize, null sets no text to parse.
     * @return {@code this} instance.
     */
    public StringTokenizer reset(final String input) {
        reset();
        this.chars = input != null ? input.toCharArray() : null;
        return this;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use.
     * @return {@code this} instance.
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim the delimiter matcher to use, null installs
     *        {@code StringMatcherFactory.INSTANCE.noneMatcher()}.
     * @return {@code this} instance.
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
        return this;
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use.
     * @return {@code this} instance.
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null.
     * @return {@code this} instance.
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     * </p>
     *
     * @param ignored the ignored character to use.
     * @return {@code this} instance.
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     * </p>
     *
     * @param ignored the ignored matcher to use, null ignored.
     * @return {@code this} instance.
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned.
     * @return {@code this} instance.
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote the quote character to use.
     * @return {@code this} instance.
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote the quote matcher to use, null ignored.
     * @return {@code this} instance.
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * </p>
     *
     * @param trimmer the trimmer matcher to use, null ignored.
     * @return {@code this} instance.
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens.
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
     * </p>
     * <p>
     * NOTE(review): this implementation scans from {@code offset} while the position is less than {@code count}, so
     * {@code count} acts as an exclusive end index and equals the number of characters scanned only when
     * {@code offset} is zero. Subclasses passing a non-zero offset should take this into account.
     * </p>
     *
     * @param srcChars the character array being tokenized, may be null.
     * @param offset the start position within the character array, must be valid.
     * @param count the number of characters to tokenize, must be valid.
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer; readNextToken returns -1 once the end has been consumed
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Returns a string representation of this tokenizer: the token list if tokenization has already been performed,
     * otherwise a fixed placeholder. This method never triggers tokenization itself.
     *
     * @return The string representation of this tokenizer.
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }
}