001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3.text; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.List; 023import java.util.ListIterator; 024import java.util.NoSuchElementException; 025import java.util.StringTokenizer; 026 027import org.apache.commons.lang3.ArrayUtils; 028import org.apache.commons.lang3.StringUtils; 029 030/** 031 * Tokenizes a string based on delimiters (separators) 032 * and supporting quoting and ignored character concepts. 033 * <p> 034 * This class can split a String into many smaller strings. It aims 035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 036 * however it offers much more control and flexibility including implementing 037 * the {@link ListIterator} interface. By default, it is set up 038 * like {@link StringTokenizer}. 039 * </p> 040 * <p> 041 * The input String is split into a number of <em>tokens</em>. 042 * Each token is separated from the next String by a <em>delimiter</em>. 043 * One or more delimiter characters must be specified. 044 * </p> 045 * <p> 046 * Each token may be surrounded by quotes. 047 * The <em>quote</em> matcher specifies the quote character(s). 048 * A quote may be escaped within a quoted section by duplicating itself. 049 * </p> 050 * <p> 051 * Between each token and the delimiter are potentially characters that need trimming. 052 * The <em>trimmer</em> matcher specifies these characters. 053 * One usage might be to trim whitespace characters. 054 * </p> 055 * <p> 056 * At any point outside the quotes there might potentially be invalid characters. 057 * The <em>ignored</em> matcher specifies these characters to be removed. 058 * One usage might be to remove new line characters. 059 * </p> 060 * <p> 061 * Empty tokens may be removed or returned as null. 062 * </p> 063 * <pre> 064 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 065 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 067 * </pre> 068 * 069 * <table> 070 * <caption>StrTokenizer properties and options</caption> 071 * <tr> 072 * <th>Property</th><th>Type</th><th>Default</th> 073 * </tr> 074 * <tr> 075 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 076 * </tr> 077 * <tr> 078 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 079 * </tr> 080 * <tr> 081 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 082 * </tr> 083 * <tr> 084 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 085 * </tr> 086 * <tr> 087 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 088 * </tr> 089 * </table> 090 * 091 * @since 2.2 092 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text 093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html"> 094 * StringTokenizer</a>. 095 */ 096@Deprecated 097public class StrTokenizer implements ListIterator<String>, Cloneable { 098 099 // @formatter:off 100 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 101 .setDelimiterMatcher(StrMatcher.commaMatcher()) 102 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 103 .setIgnoredMatcher(StrMatcher.noneMatcher()) 104 .setTrimmerMatcher(StrMatcher.trimMatcher()) 105 .setEmptyTokenAsNull(false) 106 .setIgnoreEmptyTokens(false); 107 // @formatter:on 108 109 // @formatter:off 110 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 111 .setDelimiterMatcher(StrMatcher.tabMatcher()) 112 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 113 .setIgnoredMatcher(StrMatcher.noneMatcher()) 114 .setTrimmerMatcher(StrMatcher.trimMatcher()) 115 .setEmptyTokenAsNull(false) 116 .setIgnoreEmptyTokens(false); 117 // @formatter:on 118 119 /** 120 * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 121 * 122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 123 */ 124 private static StrTokenizer getCSVClone() { 125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 126 } 127 /** 128 * Gets a new tokenizer instance which parses Comma Separated Value strings 129 * initializing it with the given input. The default for CSV processing 130 * will be trim whitespace from both ends (which can be overridden with 131 * the setTrimmer method). 132 * <p> 133 * You must call a "reset" method to set the string which you want to parse. 134 * </p> 135 * @return a new tokenizer instance which parses Comma Separated Value strings. 136 */ 137 public static StrTokenizer getCSVInstance() { 138 return getCSVClone(); 139 } 140 /** 141 * Gets a new tokenizer instance which parses Comma Separated Value strings 142 * initializing it with the given input. The default for CSV processing 143 * will be trim whitespace from both ends (which can be overridden with 144 * the setTrimmer method). 145 * 146 * @param input the text to parse. 147 * @return a new tokenizer instance which parses Comma Separated Value strings. 148 */ 149 public static StrTokenizer getCSVInstance(final char[] input) { 150 final StrTokenizer tok = getCSVClone(); 151 tok.reset(input); 152 return tok; 153 } 154 155 /** 156 * Gets a new tokenizer instance which parses Comma Separated Value strings 157 * initializing it with the given input. The default for CSV processing 158 * will be trim whitespace from both ends (which can be overridden with 159 * the setTrimmer method). 160 * 161 * @param input the text to parse. 162 * @return a new tokenizer instance which parses Comma Separated Value strings. 163 */ 164 public static StrTokenizer getCSVInstance(final String input) { 165 final StrTokenizer tok = getCSVClone(); 166 tok.reset(input); 167 return tok; 168 } 169 /** 170 * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 171 * 172 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 173 */ 174 private static StrTokenizer getTSVClone() { 175 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 176 } 177 178 /** 179 * Gets a new tokenizer instance which parses Tab Separated Value strings. 180 * The default for CSV processing will be trim whitespace from both ends 181 * (which can be overridden with the setTrimmer method). 182 * <p> 183 * You must call a "reset" method to set the string which you want to parse. 184 * </p> 185 * @return a new tokenizer instance which parses Tab Separated Value strings. 186 */ 187 public static StrTokenizer getTSVInstance() { 188 return getTSVClone(); 189 } 190 191 /** 192 * Gets a new tokenizer instance which parses Tab Separated Value strings. 193 * The default for CSV processing will be trim whitespace from both ends 194 * (which can be overridden with the setTrimmer method). 195 * 196 * @param input the string to parse. 197 * @return a new tokenizer instance which parses Tab Separated Value strings. 198 */ 199 public static StrTokenizer getTSVInstance(final char[] input) { 200 final StrTokenizer tok = getTSVClone(); 201 tok.reset(input); 202 return tok; 203 } 204 205 /** 206 * Gets a new tokenizer instance which parses Tab Separated Value strings. 207 * The default for CSV processing will be trim whitespace from both ends 208 * (which can be overridden with the setTrimmer method). 209 * 210 * @param input the string to parse. 211 * @return a new tokenizer instance which parses Tab Separated Value strings. 212 */ 213 public static StrTokenizer getTSVInstance(final String input) { 214 final StrTokenizer tok = getTSVClone(); 215 tok.reset(input); 216 return tok; 217 } 218 /** The text to work on. */ 219 private char[] chars; 220 221 /** The parsed tokens */ 222 private String[] tokens; 223 224 /** The current iteration position */ 225 private int tokenPos; 226 227 /** The delimiter matcher */ 228 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 229 230 /** The quote matcher */ 231 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 232 233 /** The ignored matcher */ 234 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 235 236 /** The trimmer matcher */ 237 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 238 239 /** Whether to return empty tokens as null */ 240 private boolean emptyAsNull; 241 242 /** Whether to ignore empty tokens */ 243 private boolean ignoreEmptyTokens = true; 244 245 /** 246 * Constructs a tokenizer splitting on space, tab, newline and formfeed 247 * as per StringTokenizer, but with no text to tokenize. 248 * <p> 249 * This constructor is normally used with {@link #reset(String)}. 250 * </p> 251 */ 252 public StrTokenizer() { 253 this.chars = null; 254 } 255 256 /** 257 * Constructs a tokenizer splitting on space, tab, newline and formfeed 258 * as per StringTokenizer. 259 * 260 * @param input the string which is to be parsed, not cloned. 261 */ 262 public StrTokenizer(final char[] input) { 263 this.chars = ArrayUtils.clone(input); 264 } 265 266 /** 267 * Constructs a tokenizer splitting on the specified character. 268 * 269 * @param input the string which is to be parsed, not cloned. 270 * @param delim the field delimiter character. 271 */ 272 public StrTokenizer(final char[] input, final char delim) { 273 this(input); 274 setDelimiterChar(delim); 275 } 276 277 /** 278 * Constructs a tokenizer splitting on the specified delimiter character 279 * and handling quotes using the specified quote character. 280 * 281 * @param input the string which is to be parsed, not cloned. 282 * @param delim the field delimiter character. 283 * @param quote the field quoted string character. 284 */ 285 public StrTokenizer(final char[] input, final char delim, final char quote) { 286 this(input, delim); 287 setQuoteChar(quote); 288 } 289 290 /** 291 * Constructs a tokenizer splitting on the specified string. 292 * 293 * @param input the string which is to be parsed, not cloned. 294 * @param delim the field delimiter string. 295 */ 296 public StrTokenizer(final char[] input, final String delim) { 297 this(input); 298 setDelimiterString(delim); 299 } 300 301 /** 302 * Constructs a tokenizer splitting using the specified delimiter matcher. 303 * 304 * @param input the string which is to be parsed, not cloned. 305 * @param delim the field delimiter matcher. 306 */ 307 public StrTokenizer(final char[] input, final StrMatcher delim) { 308 this(input); 309 setDelimiterMatcher(delim); 310 } 311 312 /** 313 * Constructs a tokenizer splitting using the specified delimiter matcher 314 * and handling quotes using the specified quote matcher. 315 * 316 * @param input the string which is to be parsed, not cloned. 317 * @param delim the field delimiter character. 318 * @param quote the field quoted string character. 319 */ 320 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 321 this(input, delim); 322 setQuoteMatcher(quote); 323 } 324 325 /** 326 * Constructs a tokenizer splitting on space, tab, newline and formfeed 327 * as per StringTokenizer. 328 * 329 * @param input the string which is to be parsed. 330 */ 331 public StrTokenizer(final String input) { 332 if (input != null) { 333 chars = input.toCharArray(); 334 } else { 335 chars = null; 336 } 337 } 338 339 /** 340 * Constructs a tokenizer splitting on the specified delimiter character. 341 * 342 * @param input the string which is to be parsed. 343 * @param delim the field delimiter character. 344 */ 345 public StrTokenizer(final String input, final char delim) { 346 this(input); 347 setDelimiterChar(delim); 348 } 349 350 /** 351 * Constructs a tokenizer splitting on the specified delimiter character 352 * and handling quotes using the specified quote character. 353 * 354 * @param input the string which is to be parsed. 355 * @param delim the field delimiter character. 356 * @param quote the field quoted string character. 357 */ 358 public StrTokenizer(final String input, final char delim, final char quote) { 359 this(input, delim); 360 setQuoteChar(quote); 361 } 362 363 /** 364 * Constructs a tokenizer splitting on the specified delimiter string. 365 * 366 * @param input the string which is to be parsed. 367 * @param delim the field delimiter string. 368 */ 369 public StrTokenizer(final String input, final String delim) { 370 this(input); 371 setDelimiterString(delim); 372 } 373 374 /** 375 * Constructs a tokenizer splitting using the specified delimiter matcher. 376 * 377 * @param input the string which is to be parsed. 378 * @param delim the field delimiter matcher. 379 */ 380 public StrTokenizer(final String input, final StrMatcher delim) { 381 this(input); 382 setDelimiterMatcher(delim); 383 } 384 385 /** 386 * Constructs a tokenizer splitting using the specified delimiter matcher 387 * and handling quotes using the specified quote matcher. 388 * 389 * @param input the string which is to be parsed. 390 * @param delim the field delimiter matcher. 391 * @param quote the field quoted string matcher. 392 */ 393 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 394 this(input, delim); 395 setQuoteMatcher(quote); 396 } 397 398 /** 399 * Unsupported ListIterator operation. 400 * 401 * @param obj this parameter ignored. 402 * @throws UnsupportedOperationException always. 403 */ 404 @Override 405 public void add(final String obj) { 406 throw new UnsupportedOperationException("add() is unsupported"); 407 } 408 409 /** 410 * Adds a token to a list, paying attention to the parameters we've set. 411 * 412 * @param list the list to add to. 413 * @param tok the token to add. 414 */ 415 private void addToken(final List<String> list, String tok) { 416 if (StringUtils.isEmpty(tok)) { 417 if (isIgnoreEmptyTokens()) { 418 return; 419 } 420 if (isEmptyTokenAsNull()) { 421 tok = null; 422 } 423 } 424 list.add(tok); 425 } 426 427 /** 428 * Checks if tokenization has been done, and if not then do it. 429 */ 430 private void checkTokenized() { 431 if (tokens == null) { 432 if (chars == null) { 433 // still call tokenize as subclass may do some work 434 final List<String> split = tokenize(null, 0, 0); 435 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 436 } else { 437 final List<String> split = tokenize(chars, 0, chars.length); 438 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 439 } 440 } 441 } 442 443 /** 444 * Creates a new instance of this Tokenizer. The new instance is reset so 445 * that it will be at the start of the token list. 446 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 447 * 448 * @return a new instance of this Tokenizer which has been reset. 449 */ 450 @Override 451 public Object clone() { 452 try { 453 return cloneReset(); 454 } catch (final CloneNotSupportedException ex) { 455 return null; 456 } 457 } 458 459 /** 460 * Creates a new instance of this Tokenizer. The new instance is reset so that 461 * it will be at the start of the token list. 462 * 463 * @return a new instance of this Tokenizer which has been reset. 464 * @throws CloneNotSupportedException if there is a problem cloning. 465 */ 466 Object cloneReset() throws CloneNotSupportedException { 467 // this method exists to enable 100% test coverage 468 final StrTokenizer cloned = (StrTokenizer) super.clone(); 469 if (cloned.chars != null) { 470 cloned.chars = cloned.chars.clone(); 471 } 472 cloned.reset(); 473 return cloned; 474 } 475 476 /** 477 * Gets the String content that the tokenizer is parsing. 478 * 479 * @return the string content being parsed. 480 */ 481 public String getContent() { 482 if (chars == null) { 483 return null; 484 } 485 return new String(chars); 486 } 487 488 /** 489 * Gets the field delimiter matcher. 490 * 491 * @return the delimiter matcher in use. 492 */ 493 public StrMatcher getDelimiterMatcher() { 494 return this.delimMatcher; 495 } 496 497 // Ignored 498 /** 499 * Gets the ignored character matcher. 500 * <p> 501 * These characters are ignored when parsing the String, unless they are 502 * within a quoted region. 503 * The default value is not to ignore anything. 504 * </p> 505 * 506 * @return the ignored matcher in use. 507 */ 508 public StrMatcher getIgnoredMatcher() { 509 return ignoredMatcher; 510 } 511 512 /** 513 * Gets the quote matcher currently in use. 514 * <p> 515 * The quote character is used to wrap data between the tokens. 516 * This enables delimiters to be entered as data. 517 * The default value is '"' (double quote). 518 * </p> 519 * 520 * @return the quote matcher in use. 521 */ 522 public StrMatcher getQuoteMatcher() { 523 return quoteMatcher; 524 } 525 526 /** 527 * Gets a copy of the full token list as an independent modifiable array. 528 * 529 * @return the tokens as a String array. 530 */ 531 public String[] getTokenArray() { 532 checkTokenized(); 533 return tokens.clone(); 534 } 535 536 /** 537 * Gets a copy of the full token list as an independent modifiable list. 538 * 539 * @return the tokens as a String array. 540 */ 541 public List<String> getTokenList() { 542 checkTokenized(); 543 final List<String> list = new ArrayList<>(tokens.length); 544 list.addAll(Arrays.asList(tokens)); 545 return list; 546 } 547 548 /** 549 * Gets the trimmer character matcher. 550 * <p> 551 * These characters are trimmed off on each side of the delimiter 552 * until the token or quote is found. 553 * The default value is not to trim anything. 554 * </p> 555 * 556 * @return the trimmer matcher in use. 557 */ 558 public StrMatcher getTrimmerMatcher() { 559 return trimmerMatcher; 560 } 561 562 /** 563 * Checks whether there are any more tokens. 564 * 565 * @return true if there are more tokens. 566 */ 567 @Override 568 public boolean hasNext() { 569 checkTokenized(); 570 return tokenPos < tokens.length; 571 } 572 573 /** 574 * Checks whether there are any previous tokens that can be iterated to. 575 * 576 * @return true if there are previous tokens. 577 */ 578 @Override 579 public boolean hasPrevious() { 580 checkTokenized(); 581 return tokenPos > 0; 582 } 583 584 /** 585 * Gets whether the tokenizer currently returns empty tokens as null. 586 * The default for this property is false. 587 * 588 * @return true if empty tokens are returned as null. 589 */ 590 public boolean isEmptyTokenAsNull() { 591 return this.emptyAsNull; 592 } 593 594 /** 595 * Gets whether the tokenizer currently ignores empty tokens. 596 * The default for this property is true. 597 * 598 * @return true if empty tokens are not returned. 599 */ 600 public boolean isIgnoreEmptyTokens() { 601 return ignoreEmptyTokens; 602 } 603 604 /** 605 * Checks if the characters at the index specified match the quote 606 * already matched in readNextToken(). 607 * 608 * @param srcChars the character array being tokenized. 609 * @param pos the position to check for a quote. 610 * @param len the length of the character array being tokenized. 611 * @param quoteStart the start position of the matched quote, 0 if no quoting. 612 * @param quoteLen the length of the matched quote, 0 if no quoting. 613 * @return true if a quote is matched. 614 */ 615 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { 616 for (int i = 0; i < quoteLen; i++) { 617 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 618 return false; 619 } 620 } 621 return true; 622 } 623 624 /** 625 * Gets the next token. 626 * 627 * @return the next String token. 628 * @throws NoSuchElementException if there are no more elements. 629 */ 630 @Override 631 public String next() { 632 if (hasNext()) { 633 return tokens[tokenPos++]; 634 } 635 throw new NoSuchElementException(); 636 } 637 638 /** 639 * Gets the index of the next token to return. 640 * 641 * @return the next token index. 642 */ 643 @Override 644 public int nextIndex() { 645 return tokenPos; 646 } 647 648 /** 649 * Gets the next token from the String. 650 * Equivalent to {@link #next()} except it returns null rather than 651 * throwing {@link NoSuchElementException} when no tokens remain. 652 * 653 * @return the next sequential token, or null when no more tokens are found. 654 */ 655 public String nextToken() { 656 if (hasNext()) { 657 return tokens[tokenPos++]; 658 } 659 return null; 660 } 661 662 /** 663 * Gets the token previous to the last returned token. 664 * 665 * @return the previous token. 666 */ 667 @Override 668 public String previous() { 669 if (hasPrevious()) { 670 return tokens[--tokenPos]; 671 } 672 throw new NoSuchElementException(); 673 } 674 675 /** 676 * Gets the index of the previous token. 677 * 678 * @return the previous token index. 679 */ 680 @Override 681 public int previousIndex() { 682 return tokenPos - 1; 683 } 684 685 /** 686 * Gets the previous token from the String. 687 * 688 * @return the previous sequential token, or null when no more tokens are found. 689 */ 690 public String previousToken() { 691 if (hasPrevious()) { 692 return tokens[--tokenPos]; 693 } 694 return null; 695 } 696 697 /** 698 * Reads character by character through the String to get the next token. 699 * 700 * @param srcChars the character array being tokenized. 701 * @param start the first character of field. 702 * @param len the length of the character array being tokenized. 703 * @param workArea a temporary work area. 704 * @param tokenList the list of parsed tokens. 705 * @return the starting position of the next field (the character 706 * immediately after the delimiter), or -1 if end of string found. 707 */ 708 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { 709 // skip all leading whitespace, unless it is the 710 // field delimiter or the quote character 711 while (start < len) { 712 final int removeLen = Math.max( 713 getIgnoredMatcher().isMatch(srcChars, start, start, len), 714 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 715 if (removeLen == 0 || 716 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || 717 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 718 break; 719 } 720 start += removeLen; 721 } 722 723 // handle reaching end 724 if (start >= len) { 725 addToken(tokenList, StringUtils.EMPTY); 726 return -1; 727 } 728 729 // handle empty token 730 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 731 if (delimLen > 0) { 732 addToken(tokenList, StringUtils.EMPTY); 733 return start + delimLen; 734 } 735 736 // handle found token 737 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 738 if (quoteLen > 0) { 739 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 740 } 741 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 742 } 743 744 /** 745 * Reads a possibly quoted string token. 746 * 747 * @param srcChars the character array being tokenized. 748 * @param start the first character of field. 749 * @param len the length of the character array being tokenized. 750 * @param workArea a temporary work area. 751 * @param tokenList the list of parsed tokens. 752 * @param quoteStart the start position of the matched quote, 0 if no quoting. 753 * @param quoteLen the length of the matched quote, 0 if no quoting. 754 * @return the starting position of the next field (the character 755 * immediately after the delimiter, or if end of string found, 756 * then the length of string. 757 */ 758 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 759 final List<String> tokenList, final int quoteStart, final int quoteLen) { 760 // Loop until we've found the end of the quoted 761 // string or the end of the input 762 workArea.clear(); 763 int pos = start; 764 boolean quoting = quoteLen > 0; 765 int trimStart = 0; 766 767 while (pos < len) { 768 // quoting mode can occur several times throughout a string 769 // we must switch between quoting and non-quoting until we 770 // encounter a non-quoted delimiter, or end of string 771 if (quoting) { 772 // In quoting mode 773 774 // If we've found a quote character, see if it's 775 // followed by a second quote. If so, then we need 776 // to actually put the quote character into the token 777 // rather than end the token. 778 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 779 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 780 // matched pair of quotes, thus an escaped quote 781 workArea.append(srcChars, pos, quoteLen); 782 pos += quoteLen * 2; 783 trimStart = workArea.size(); 784 continue; 785 } 786 787 // end of quoting 788 quoting = false; 789 pos += quoteLen; 790 continue; 791 } 792 793 } else { 794 // Not in quoting mode 795 796 // check for delimiter, and thus end of token 797 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 798 if (delimLen > 0) { 799 // return condition when end of token found 800 addToken(tokenList, workArea.substring(0, trimStart)); 801 return pos + delimLen; 802 } 803 804 // check for quote, and thus back into quoting mode 805 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 806 quoting = true; 807 pos += quoteLen; 808 continue; 809 } 810 811 // check for ignored (outside quotes), and ignore 812 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 813 if (ignoredLen > 0) { 814 pos += ignoredLen; 815 continue; 816 } 817 818 // check for trimmed character 819 // don't yet know if it's at the end, so copy to workArea 820 // use trimStart to keep track of trim at the end 821 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 822 if (trimmedLen > 0) { 823 workArea.append(srcChars, pos, trimmedLen); 824 pos += trimmedLen; 825 continue; 826 } 827 } 828 // copy regular character from inside quotes 829 workArea.append(srcChars[pos++]); 830 trimStart = workArea.size(); 831 } 832 833 // return condition when end of string found 834 addToken(tokenList, workArea.substring(0, trimStart)); 835 return -1; 836 } 837 838 /** 839 * Unsupported ListIterator operation. 840 * 841 * @throws UnsupportedOperationException always. 842 */ 843 @Override 844 public void remove() { 845 throw new UnsupportedOperationException("remove() is unsupported"); 846 } 847 848 /** 849 * Resets this tokenizer, forgetting all parsing and iteration already completed. 850 * <p> 851 * This method allows the same tokenizer to be reused for the same String. 852 * </p> 853 * 854 * @return {@code this} instance. 855 */ 856 public StrTokenizer reset() { 857 tokenPos = 0; 858 tokens = null; 859 return this; 860 } 861 862 /** 863 * Reset this tokenizer, giving it a new input string to parse. 864 * In this manner you can re-use a tokenizer with the same settings 865 * on multiple input lines. 866 * 867 * @param input the new character array to tokenize, not cloned, null sets no text to parse. 868 * @return {@code this} instance. 869 */ 870 public StrTokenizer reset(final char[] input) { 871 reset(); 872 this.chars = ArrayUtils.clone(input); 873 return this; 874 } 875 876 /** 877 * Reset this tokenizer, giving it a new input string to parse. 878 * In this manner you can re-use a tokenizer with the same settings 879 * on multiple input lines. 880 * 881 * @param input the new string to tokenize, null sets no text to parse. 882 * @return {@code this} instance. 883 */ 884 public StrTokenizer reset(final String input) { 885 reset(); 886 if (input != null) { 887 this.chars = input.toCharArray(); 888 } else { 889 this.chars = null; 890 } 891 return this; 892 } 893 894 /** 895 * Unsupported ListIterator operation. 896 * 897 * @param obj this parameter ignored. 898 * @throws UnsupportedOperationException always. 899 */ 900 @Override 901 public void set(final String obj) { 902 throw new UnsupportedOperationException("set() is unsupported"); 903 } 904 905 /** 906 * Sets the field delimiter character. 907 * 908 * @param delim the delimiter character to use. 909 * @return this, to enable chaining. 910 */ 911 public StrTokenizer setDelimiterChar(final char delim) { 912 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 913 } 914 915 /** 916 * Sets the field delimiter matcher. 917 * <p> 918 * The delimiter is used to separate one token from another. 919 * </p> 920 * 921 * @param delim the delimiter matcher to use. 922 * @return this, to enable chaining. 923 */ 924 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 925 if (delim == null) { 926 this.delimMatcher = StrMatcher.noneMatcher(); 927 } else { 928 this.delimMatcher = delim; 929 } 930 return this; 931 } 932 933 /** 934 * Sets the field delimiter string. 935 * 936 * @param delim the delimiter string to use. 937 * @return this, to enable chaining. 938 */ 939 public StrTokenizer setDelimiterString(final String delim) { 940 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 941 } 942 943 /** 944 * Sets whether the tokenizer should return empty tokens as null. 945 * The default for this property is false. 946 * 947 * @param emptyAsNull whether empty tokens are returned as null. 948 * @return this, to enable chaining. 949 */ 950 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 951 this.emptyAsNull = emptyAsNull; 952 return this; 953 } 954 955 /** 956 * Sets the character to ignore. 957 * <p> 958 * This character is ignored when parsing the String, unless it is 959 * within a quoted region. 960 * 961 * @param ignored the ignored character to use. 962 * @return this, to enable chaining. 963 */ 964 public StrTokenizer setIgnoredChar(final char ignored) { 965 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 966 } 967 968 /** 969 * Sets the matcher for characters to ignore. 970 * <p> 971 * These characters are ignored when parsing the String, unless they are 972 * within a quoted region. 973 * </p> 974 * 975 * @param ignored the ignored matcher to use, null ignored. 976 * @return {@code this} instance. 977 */ 978 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 979 if (ignored != null) { 980 this.ignoredMatcher = ignored; 981 } 982 return this; 983 } 984 985 /** 986 * Sets whether the tokenizer should ignore and not return empty tokens. 987 * The default for this property is true. 988 * 989 * @param ignoreEmptyTokens whether empty tokens are not returned. 990 * @return {@code this} instance. 991 */ 992 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 993 this.ignoreEmptyTokens = ignoreEmptyTokens; 994 return this; 995 } 996 997 /** 998 * Sets the quote character to use. 999 * <p> 1000 * The quote character is used to wrap data between the tokens. 1001 * This enables delimiters to be entered as data. 1002 * </p> 1003 * 1004 * @param quote the quote character to use. 1005 * @return {@code this} instance. 1006 */ 1007 public StrTokenizer setQuoteChar(final char quote) { 1008 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 1009 } 1010 1011 /** 1012 * Sets the quote matcher to use. 1013 * <p> 1014 * The quote character is used to wrap data between the tokens. 1015 * This enables delimiters to be entered as data. 1016 * </p> 1017 * 1018 * @param quote the quote matcher to use, null ignored. 1019 * @return {@code this} instance. 1020 */ 1021 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1022 if (quote != null) { 1023 this.quoteMatcher = quote; 1024 } 1025 return this; 1026 } 1027 1028 /** 1029 * Sets the matcher for characters to trim. 1030 * <p> 1031 * These characters are trimmed off on each side of the delimiter 1032 * until the token or quote is found. 1033 * </p> 1034 * 1035 * @param trimmer the trimmer matcher to use, null ignored. 1036 * @return {@code this} instance. 1037 */ 1038 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1039 if (trimmer != null) { 1040 this.trimmerMatcher = trimmer; 1041 } 1042 return this; 1043 } 1044 1045 // API 1046 /** 1047 * Gets the number of tokens found in the String. 1048 * 1049 * @return the number of matched tokens. 1050 */ 1051 public int size() { 1052 checkTokenized(); 1053 return tokens.length; 1054 } 1055 1056 /** 1057 * Internal method to performs the tokenization. 1058 * <p> 1059 * Most users of this class do not need to call this method. This method 1060 * will be called automatically by other (public) methods when required. 1061 * </p> 1062 * <p> 1063 * This method exists to allow subclasses to add code before or after the 1064 * tokenization. For example, a subclass could alter the character array, 1065 * offset or count to be parsed, or call the tokenizer multiple times on 1066 * multiple strings. It is also be possible to filter the results. 1067 * </p> 1068 * <p> 1069 * {@link StrTokenizer} will always pass a zero offset and a count 1070 * equal to the length of the array to this method, however a subclass 1071 * may pass other values, or even an entirely different array. 1072 * </p> 1073 * 1074 * @param srcChars the character array being tokenized, may be null. 1075 * @param offset the start position within the character array, must be valid. 1076 * @param count the number of characters to tokenize, must be valid. 1077 * @return the modifiable list of String tokens, unmodifiable if null array or zero count. 1078 */ 1079 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1080 if (ArrayUtils.isEmpty(srcChars)) { 1081 return Collections.emptyList(); 1082 } 1083 final StrBuilder buf = new StrBuilder(); 1084 final List<String> tokenList = new ArrayList<>(); 1085 int pos = offset; 1086 1087 // loop around the entire buffer 1088 while (pos >= 0 && pos < count) { 1089 // find next token 1090 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1091 1092 // handle case where end of string is a delimiter 1093 if (pos >= count) { 1094 addToken(tokenList, StringUtils.EMPTY); 1095 } 1096 } 1097 return tokenList; 1098 } 1099 1100 /** 1101 * Gets the String content that the tokenizer is parsing. 1102 * 1103 * @return the string content being parsed. 1104 */ 1105 @Override 1106 public String toString() { 1107 if (tokens == null) { 1108 return "StrTokenizer[not tokenized yet]"; 1109 } 1110 return "StrTokenizer" + getTokenList(); 1111 } 1112 1113}