001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.util.ArrayList; 020import java.util.Collections; 021import java.util.List; 022import java.util.ListIterator; 023import java.util.NoSuchElementException; 024 025import org.apache.commons.lang3.ArrayUtils; 026import org.apache.commons.lang3.StringUtils; 027 028/** 029 * Tokenizes a string based on delimiters (separators) 030 * and supporting quoting and ignored character concepts. 031 * <p> 032 * This class can split a String into many smaller strings. It aims 033 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 034 * however it offers much more control and flexibility including implementing 035 * the {@code ListIterator} interface. By default, it is set up 036 * like {@code StringTokenizer}. 037 * <p> 038 * The input String is split into a number of <em>tokens</em>. 039 * Each token is separated from the next String by a <em>delimiter</em>. 040 * One or more delimiter characters must be specified. 041 * <p> 042 * Each token may be surrounded by quotes. 043 * The <em>quote</em> matcher specifies the quote character(s). 044 * A quote may be escaped within a quoted section by duplicating itself. 045 * <p> 046 * Between each token and the delimiter are potentially characters that need trimming. 047 * The <em>trimmer</em> matcher specifies these characters. 048 * One usage might be to trim whitespace characters. 049 * <p> 050 * At any point outside the quotes there might potentially be invalid characters. 051 * The <em>ignored</em> matcher specifies these characters to be removed. 052 * One usage might be to remove new line characters. 053 * <p> 054 * Empty tokens may be removed or returned as null. 055 * <pre> 056 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 057 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 058 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 059 * </pre> 060 * 061 * <table> 062 * <caption>StrTokenizer properties and options</caption> 063 * <tr> 064 * <th>Property</th><th>Type</th><th>Default</th> 065 * </tr> 066 * <tr> 067 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 068 * </tr> 069 * <tr> 070 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 071 * </tr> 072 * <tr> 073 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 074 * </tr> 075 * <tr> 076 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 077 * </tr> 078 * <tr> 079 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 080 * </tr> 081 * </table> 082 * 083 * @since 1.0 084 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0. 085 */ 086@Deprecated 087public class StrTokenizer implements ListIterator<String>, Cloneable { 088 089 /** Comma separated values tokenizer internal variable. */ 090 // @formatter:off 091 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 092 .setDelimiterMatcher(StrMatcher.commaMatcher()) 093 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 094 .setIgnoredMatcher(StrMatcher.noneMatcher()) 095 .setTrimmerMatcher(StrMatcher.trimMatcher()) 096 .setEmptyTokenAsNull(false) 097 .setIgnoreEmptyTokens(false); 098 // @formatter:on 099 100 /** Tab separated values tokenizer internal variable. */ 101 // @formatter:off 102 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 103 .setDelimiterMatcher(StrMatcher.tabMatcher()) 104 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 105 .setIgnoredMatcher(StrMatcher.noneMatcher()) 106 .setTrimmerMatcher(StrMatcher.trimMatcher()) 107 .setEmptyTokenAsNull(false) 108 .setIgnoreEmptyTokens(false); 109 // @formatter:on 110 111 /** 112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 113 * 114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 115 */ 116 private static StrTokenizer getCSVClone() { 117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 118 } 119 120 /** 121 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 122 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 123 * <p> 124 * You must call a "reset" method to set the string which you want to parse. 125 * </p> 126 * 127 * @return a new tokenizer instance which parses Comma Separated Value strings. 128 */ 129 public static StrTokenizer getCSVInstance() { 130 return getCSVClone(); 131 } 132 133 /** 134 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 135 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 136 * 137 * @param input the text to parse. 138 * @return a new tokenizer instance which parses Comma Separated Value strings. 139 */ 140 public static StrTokenizer getCSVInstance(final char[] input) { 141 final StrTokenizer tok = getCSVClone(); 142 tok.reset(input); 143 return tok; 144 } 145 146 /** 147 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 148 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 149 * 150 * @param input the text to parse. 151 * @return a new tokenizer instance which parses Comma Separated Value strings. 152 */ 153 public static StrTokenizer getCSVInstance(final String input) { 154 final StrTokenizer tok = getCSVClone(); 155 tok.reset(input); 156 return tok; 157 } 158 159 /** 160 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 161 * 162 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 163 */ 164 private static StrTokenizer getTSVClone() { 165 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 166 } 167 168 /** 169 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 170 * be overridden with the setTrimmer method). 171 * <p> 172 * You must call a "reset" method to set the string which you want to parse. 173 * </p> 174 * 175 * @return a new tokenizer instance which parses Tab Separated Value strings. 176 */ 177 public static StrTokenizer getTSVInstance() { 178 return getTSVClone(); 179 } 180 181 /** 182 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 183 * be overridden with the setTrimmer method). 184 * 185 * @param input the string to parse. 186 * @return a new tokenizer instance which parses Tab Separated Value strings. 187 */ 188 public static StrTokenizer getTSVInstance(final char[] input) { 189 final StrTokenizer tok = getTSVClone(); 190 tok.reset(input); 191 return tok; 192 } 193 194 /** 195 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 196 * be overridden with the setTrimmer method). 197 * 198 * @param input the string to parse. 199 * @return a new tokenizer instance which parses Tab Separated Value strings. 200 */ 201 public static StrTokenizer getTSVInstance(final String input) { 202 final StrTokenizer tok = getTSVClone(); 203 tok.reset(input); 204 return tok; 205 } 206 207 /** The text to work on. */ 208 private char[] chars; 209 210 /** The parsed tokens. */ 211 private String[] tokens; 212 213 /** The current iteration position. */ 214 private int tokenPos; 215 216 /** The delimiter matcher. */ 217 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 218 219 /** The quote matcher. */ 220 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 221 222 /** The ignored matcher. */ 223 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 224 225 /** The trimmer matcher. */ 226 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 227 228 /** Whether to return empty tokens as null. */ 229 private boolean emptyAsNull; 230 231 /** Whether to ignore empty tokens. */ 232 private boolean ignoreEmptyTokens = true; 233 234 /** 235 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize. 236 * <p> 237 * This constructor is normally used with {@link #reset(String)}. 238 * </p> 239 */ 240 public StrTokenizer() { 241 this.chars = null; 242 } 243 244 /** 245 * Constructs a tokenizer splitting on space, tab, newline and form feed 246 * as per StringTokenizer. 247 * 248 * @param input the string which is to be parsed, not cloned. 249 */ 250 public StrTokenizer(final char[] input) { 251 if (input == null) { 252 this.chars = null; 253 } else { 254 this.chars = input.clone(); 255 } 256 } 257 258 /** 259 * Constructs a tokenizer splitting on the specified character. 260 * 261 * @param input the string which is to be parsed, not cloned. 262 * @param delim the field delimiter character. 263 */ 264 public StrTokenizer(final char[] input, final char delim) { 265 this(input); 266 setDelimiterChar(delim); 267 } 268 269 /** 270 * Constructs a tokenizer splitting on the specified delimiter character 271 * and handling quotes using the specified quote character. 272 * 273 * @param input the string which is to be parsed, not cloned. 274 * @param delim the field delimiter character. 275 * @param quote the field quoted string character. 276 */ 277 public StrTokenizer(final char[] input, final char delim, final char quote) { 278 this(input, delim); 279 setQuoteChar(quote); 280 } 281 282 /** 283 * Constructs a tokenizer splitting on the specified string. 284 * 285 * @param input the string which is to be parsed, not cloned. 286 * @param delim the field delimiter string. 287 */ 288 public StrTokenizer(final char[] input, final String delim) { 289 this(input); 290 setDelimiterString(delim); 291 } 292 293 /** 294 * Constructs a tokenizer splitting using the specified delimiter matcher. 295 * 296 * @param input the string which is to be parsed, not cloned. 297 * @param delim the field delimiter matcher. 298 */ 299 public StrTokenizer(final char[] input, final StrMatcher delim) { 300 this(input); 301 setDelimiterMatcher(delim); 302 } 303 304 /** 305 * Constructs a tokenizer splitting using the specified delimiter matcher 306 * and handling quotes using the specified quote matcher. 307 * 308 * @param input the string which is to be parsed, not cloned. 309 * @param delim the field delimiter character. 310 * @param quote the field quoted string character. 311 */ 312 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 313 this(input, delim); 314 setQuoteMatcher(quote); 315 } 316 317 /** 318 * Constructs a tokenizer splitting on space, tab, newline and form feed 319 * as per StringTokenizer. 320 * 321 * @param input the string which is to be parsed. 322 */ 323 public StrTokenizer(final String input) { 324 if (input != null) { 325 chars = input.toCharArray(); 326 } else { 327 chars = null; 328 } 329 } 330 331 /** 332 * Constructs a tokenizer splitting on the specified delimiter character. 333 * 334 * @param input the string which is to be parsed. 335 * @param delim the field delimiter character. 336 */ 337 public StrTokenizer(final String input, final char delim) { 338 this(input); 339 setDelimiterChar(delim); 340 } 341 342 /** 343 * Constructs a tokenizer splitting on the specified delimiter character 344 * and handling quotes using the specified quote character. 345 * 346 * @param input the string which is to be parsed. 347 * @param delim the field delimiter character. 348 * @param quote the field quoted string character. 349 */ 350 public StrTokenizer(final String input, final char delim, final char quote) { 351 this(input, delim); 352 setQuoteChar(quote); 353 } 354 355 /** 356 * Constructs a tokenizer splitting on the specified delimiter string. 357 * 358 * @param input the string which is to be parsed. 359 * @param delim the field delimiter string. 360 */ 361 public StrTokenizer(final String input, final String delim) { 362 this(input); 363 setDelimiterString(delim); 364 } 365 366 /** 367 * Constructs a tokenizer splitting using the specified delimiter matcher. 368 * 369 * @param input the string which is to be parsed. 370 * @param delim the field delimiter matcher. 371 */ 372 public StrTokenizer(final String input, final StrMatcher delim) { 373 this(input); 374 setDelimiterMatcher(delim); 375 } 376 377 /** 378 * Constructs a tokenizer splitting using the specified delimiter matcher 379 * and handling quotes using the specified quote matcher. 380 * 381 * @param input the string which is to be parsed. 382 * @param delim the field delimiter matcher. 383 * @param quote the field quoted string matcher. 384 */ 385 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 386 this(input, delim); 387 setQuoteMatcher(quote); 388 } 389 390 /** 391 * Unsupported ListIterator operation. 392 * @param obj this parameter ignored. 393 * @throws UnsupportedOperationException always. 394 */ 395 @Override 396 public void add(final String obj) { 397 throw new UnsupportedOperationException("add() is unsupported"); 398 } 399 400 /** 401 * Adds a token to a list, paying attention to the parameters we've set. 402 * 403 * @param list the list to add to. 404 * @param tok the token to add. 405 */ 406 private void addToken(final List<String> list, String tok) { 407 if (tok == null || tok.isEmpty()) { 408 if (isIgnoreEmptyTokens()) { 409 return; 410 } 411 if (isEmptyTokenAsNull()) { 412 tok = null; 413 } 414 } 415 list.add(tok); 416 } 417 418 /** 419 * Checks if tokenization has been done, and if not then do it. 420 */ 421 private void checkTokenized() { 422 if (tokens == null) { 423 if (chars == null) { 424 // still call tokenize as subclass may do some work 425 final List<String> split = tokenize(null, 0, 0); 426 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 427 } else { 428 final List<String> split = tokenize(chars, 0, chars.length); 429 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 430 } 431 } 432 } 433 434 /** 435 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a 436 * {@link CloneNotSupportedException} is caught, return {@code null}. 437 * 438 * @return a new instance of this Tokenizer which has been reset. 439 */ 440 @Override 441 public Object clone() { 442 try { 443 return cloneReset(); 444 } catch (final CloneNotSupportedException ex) { 445 return null; 446 } 447 } 448 449 /** 450 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. 451 * 452 * @return a new instance of this Tokenizer which has been reset. 453 * @throws CloneNotSupportedException if there is a problem cloning. 454 */ 455 Object cloneReset() throws CloneNotSupportedException { 456 // this method exists to enable 100% test coverage 457 final StrTokenizer cloned = (StrTokenizer) super.clone(); 458 if (cloned.chars != null) { 459 cloned.chars = cloned.chars.clone(); 460 } 461 cloned.reset(); 462 return cloned; 463 } 464 465 /** 466 * Gets the String content that the tokenizer is parsing. 467 * 468 * @return The string content being parsed. 469 */ 470 public String getContent() { 471 if (chars == null) { 472 return null; 473 } 474 return new String(chars); 475 } 476 477 /** 478 * Gets the field delimiter matcher. 479 * 480 * @return The delimiter matcher in use. 481 */ 482 public StrMatcher getDelimiterMatcher() { 483 return this.delimMatcher; 484 } 485 486 /** 487 * Gets the ignored character matcher. 488 * <p> 489 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything. 490 * </p> 491 * 492 * @return The ignored matcher in use. 493 */ 494 public StrMatcher getIgnoredMatcher() { 495 return ignoredMatcher; 496 } 497 498 /** 499 * Gets the quote matcher currently in use. 500 * <p> 501 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote). 502 * </p> 503 * 504 * @return The quote matcher in use. 505 */ 506 public StrMatcher getQuoteMatcher() { 507 return quoteMatcher; 508 } 509 510 /** 511 * Gets a copy of the full token list as an independent modifiable array. 512 * 513 * @return The tokens as a String array. 514 */ 515 public String[] getTokenArray() { 516 checkTokenized(); 517 return tokens.clone(); 518 } 519 520 /** 521 * Gets a copy of the full token list as an independent modifiable list. 522 * 523 * @return The tokens as a String array. 524 */ 525 public List<String> getTokenList() { 526 checkTokenized(); 527 final List<String> list = new ArrayList<>(tokens.length); 528 Collections.addAll(list, tokens); 529 530 return list; 531 } 532 533 /** 534 * Gets the trimmer character matcher. 535 * <p> 536 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything. 537 * </p> 538 * 539 * @return The trimmer matcher in use. 540 */ 541 public StrMatcher getTrimmerMatcher() { 542 return trimmerMatcher; 543 } 544 545 /** 546 * Checks whether there are any more tokens. 547 * 548 * @return true if there are more tokens. 549 */ 550 @Override 551 public boolean hasNext() { 552 checkTokenized(); 553 return tokenPos < tokens.length; 554 } 555 556 /** 557 * Checks whether there are any previous tokens that can be iterated to. 558 * 559 * @return true if there are previous tokens. 560 */ 561 @Override 562 public boolean hasPrevious() { 563 checkTokenized(); 564 return tokenPos > 0; 565 } 566 567 /** 568 * Gets whether the tokenizer currently returns empty tokens as null. 569 * The default for this property is false. 570 * 571 * @return true if empty tokens are returned as null. 572 */ 573 public boolean isEmptyTokenAsNull() { 574 return this.emptyAsNull; 575 } 576 577 /** 578 * Gets whether the tokenizer currently ignores empty tokens. 579 * The default for this property is true. 580 * 581 * @return true if empty tokens are not returned. 582 */ 583 public boolean isIgnoreEmptyTokens() { 584 return ignoreEmptyTokens; 585 } 586 587 /** 588 * Checks if the characters at the index specified match the quote 589 * already matched in readNextToken(). 590 * 591 * @param srcChars the character array being tokenized. 592 * @param pos the position to check for a quote. 593 * @param len the length of the character array being tokenized. 594 * @param quoteStart the start position of the matched quote, 0 if no quoting. 595 * @param quoteLen the length of the matched quote, 0 if no quoting. 596 * @return true if a quote is matched. 597 */ 598 private boolean isQuote(final char[] srcChars, 599 final int pos, 600 final int len, 601 final int quoteStart, 602 final int quoteLen) { 603 for (int i = 0; i < quoteLen; i++) { 604 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 605 return false; 606 } 607 } 608 return true; 609 } 610 611 /** 612 * Gets the next token. 613 * 614 * @return The next String token. 615 * @throws NoSuchElementException if there are no more elements. 616 */ 617 @Override 618 public String next() { 619 if (hasNext()) { 620 return tokens[tokenPos++]; 621 } 622 throw new NoSuchElementException(); 623 } 624 625 /** 626 * Gets the index of the next token to return. 627 * 628 * @return The next token index. 629 */ 630 @Override 631 public int nextIndex() { 632 return tokenPos; 633 } 634 635 /** 636 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no 637 * tokens remain. 638 * 639 * @return The next sequential token, or null when no more tokens are found. 640 */ 641 public String nextToken() { 642 if (hasNext()) { 643 return tokens[tokenPos++]; 644 } 645 return null; 646 } 647 648 /** 649 * Gets the token previous to the last returned token. 650 * 651 * @return The previous token. 652 */ 653 @Override 654 public String previous() { 655 if (hasPrevious()) { 656 return tokens[--tokenPos]; 657 } 658 throw new NoSuchElementException(); 659 } 660 661 /** 662 * Gets the index of the previous token. 663 * 664 * @return The previous token index. 665 */ 666 @Override 667 public int previousIndex() { 668 return tokenPos - 1; 669 } 670 671 /** 672 * Gets the previous token from the String. 673 * 674 * @return The previous sequential token, or null when no more tokens are found. 675 */ 676 public String previousToken() { 677 if (hasPrevious()) { 678 return tokens[--tokenPos]; 679 } 680 return null; 681 } 682 683 /** 684 * Reads character by character through the String to get the next token. 685 * 686 * @param srcChars the character array being tokenized. 687 * @param start the first character of field. 688 * @param len the length of the character array being tokenized. 689 * @param workArea a temporary work area. 690 * @param tokenList the list of parsed tokens. 691 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found. 692 */ 693 private int readNextToken(final char[] srcChars, 694 int start, 695 final int len, 696 final StrBuilder workArea, 697 final List<String> tokenList) { 698 // skip all leading whitespace, unless it is the 699 // field delimiter or the quote character 700 while (start < len) { 701 final int removeLen = Math.max( 702 getIgnoredMatcher().isMatch(srcChars, start, start, len), 703 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 704 if (removeLen == 0 705 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 706 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 707 break; 708 } 709 start += removeLen; 710 } 711 712 // handle reaching end 713 if (start >= len) { 714 addToken(tokenList, StringUtils.EMPTY); 715 return -1; 716 } 717 718 // handle empty token 719 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 720 if (delimLen > 0) { 721 addToken(tokenList, StringUtils.EMPTY); 722 return start + delimLen; 723 } 724 725 // handle found token 726 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 727 if (quoteLen > 0) { 728 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 729 } 730 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 731 } 732 733 /** 734 * Reads a possibly quoted string token. 735 * 736 * @param srcChars the character array being tokenized. 737 * @param start the first character of field. 738 * @param len the length of the character array being tokenized. 739 * @param workArea a temporary work area. 740 * @param tokenList the list of parsed tokens. 741 * @param quoteStart the start position of the matched quote, 0 if no quoting. 742 * @param quoteLen the length of the matched quote, 0 if no quoting. 743 * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string. 744 */ 745 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 746 final List<String> tokenList, final int quoteStart, final int quoteLen) { 747 // Loop until we've found the end of the quoted 748 // string or the end of the input 749 workArea.clear(); 750 int pos = start; 751 boolean quoting = quoteLen > 0; 752 int trimStart = 0; 753 754 while (pos < len) { 755 // quoting mode can occur several times throughout a string 756 // we must switch between quoting and non-quoting until we 757 // encounter a non-quoted delimiter, or end of string 758 if (quoting) { 759 // In quoting mode 760 761 // If we've found a quote character, see if it's 762 // followed by a second quote. If so, then we need 763 // to actually put the quote character into the token 764 // rather than end the token. 765 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 766 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 767 // matched pair of quotes, thus an escaped quote 768 workArea.append(srcChars, pos, quoteLen); 769 pos += quoteLen * 2; 770 trimStart = workArea.size(); 771 continue; 772 } 773 774 // end of quoting 775 quoting = false; 776 pos += quoteLen; 777 continue; 778 } 779 780 } else { 781 // Not in quoting mode 782 783 // check for delimiter, and thus end of token 784 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 785 if (delimLen > 0) { 786 // return condition when end of token found 787 addToken(tokenList, workArea.substring(0, trimStart)); 788 return pos + delimLen; 789 } 790 791 // check for quote, and thus back into quoting mode 792 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 793 quoting = true; 794 pos += quoteLen; 795 continue; 796 } 797 798 // check for ignored (outside quotes), and ignore 799 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 800 if (ignoredLen > 0) { 801 pos += ignoredLen; 802 continue; 803 } 804 805 // check for trimmed character 806 // don't yet know if its at the end, so copy to workArea 807 // use trimStart to keep track of trim at the end 808 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 809 if (trimmedLen > 0) { 810 workArea.append(srcChars, pos, trimmedLen); 811 pos += trimmedLen; 812 continue; 813 } 814 815 } 816 // copy regular character from inside quotes 817 workArea.append(srcChars[pos++]); 818 trimStart = workArea.size(); 819 } 820 821 // return condition when end of string found 822 addToken(tokenList, workArea.substring(0, trimStart)); 823 return -1; 824 } 825 826 /** 827 * Unsupported ListIterator operation. 828 * 829 * @throws UnsupportedOperationException always. 830 */ 831 @Override 832 public void remove() { 833 throw new UnsupportedOperationException("remove() is unsupported"); 834 } 835 836 /** 837 * Resets this tokenizer, forgetting all parsing and iteration already completed. 838 * <p> 839 * This method allows the same tokenizer to be reused for the same String. 840 * </p> 841 * 842 * @return {@code this} instance. 843 */ 844 public StrTokenizer reset() { 845 tokenPos = 0; 846 tokens = null; 847 return this; 848 } 849 850 /** 851 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines. 852 * 853 * @param input the new character array to tokenize, not cloned, null sets no text to parse. 854 * @return {@code this} instance. 855 */ 856 public StrTokenizer reset(final char[] input) { 857 reset(); 858 if (input != null) { 859 this.chars = input.clone(); 860 } else { 861 this.chars = null; 862 } 863 return this; 864 } 865 866 /** 867 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines. 868 * 869 * @param input the new string to tokenize, null sets no text to parse. 870 * @return {@code this} instance. 871 */ 872 public StrTokenizer reset(final String input) { 873 reset(); 874 if (input != null) { 875 this.chars = input.toCharArray(); 876 } else { 877 this.chars = null; 878 } 879 return this; 880 } 881 882 /** 883 * Unsupported ListIterator operation. 884 * 885 * @param obj this parameter ignored. 886 * @throws UnsupportedOperationException Always thrown. 887 */ 888 @Override 889 public void set(final String obj) { 890 throw new UnsupportedOperationException("set() is unsupported"); 891 } 892 893 /** 894 * Sets the field delimiter character. 895 * 896 * @param delim the delimiter character to use. 897 * @return {@code this} instance. 898 */ 899 public StrTokenizer setDelimiterChar(final char delim) { 900 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 901 } 902 903 /** 904 * Sets the field delimiter matcher. 905 * <p> 906 * The delimiter is used to separate one token from another. 907 * </p> 908 * 909 * @param delim the delimiter matcher to use. 910 * @return {@code this} instance. 911 */ 912 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 913 if (delim == null) { 914 this.delimMatcher = StrMatcher.noneMatcher(); 915 } else { 916 this.delimMatcher = delim; 917 } 918 return this; 919 } 920 921 /** 922 * Sets the field delimiter string. 923 * 924 * @param delim the delimiter string to use. 925 * @return {@code this} instance. 926 */ 927 public StrTokenizer setDelimiterString(final String delim) { 928 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 929 } 930 931 /** 932 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false. 933 * 934 * @param emptyAsNull whether empty tokens are returned as null. 935 * @return {@code this} instance. 936 */ 937 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 938 this.emptyAsNull = emptyAsNull; 939 return this; 940 } 941 942 /** 943 * Sets the character to ignore. 944 * <p> 945 * This character is ignored when parsing the String, unless it is within a quoted region. 946 * </p> 947 * 948 * @param ignored the ignored character to use. 949 * @return {@code this} instance. 950 */ 951 public StrTokenizer setIgnoredChar(final char ignored) { 952 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 953 } 954 955 /** 956 * Sets the matcher for characters to ignore. 957 * <p> 958 * These characters are ignored when parsing the String, unless they are within a quoted region. 959 * </p> 960 * 961 * @param ignored the ignored matcher to use, null ignored. 962 * @return {@code this} instance. 963 */ 964 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 965 if (ignored != null) { 966 this.ignoredMatcher = ignored; 967 } 968 return this; 969 } 970 971 /** 972 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true. 973 * 974 * @param ignoreEmptyTokens whether empty tokens are not returned. 975 * @return {@code this} instance. 976 */ 977 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 978 this.ignoreEmptyTokens = ignoreEmptyTokens; 979 return this; 980 } 981 982 /** 983 * Sets the quote character to use. 984 * <p> 985 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 986 * </p> 987 * 988 * @param quote the quote character to use. 989 * @return {@code this} instance. 990 */ 991 public StrTokenizer setQuoteChar(final char quote) { 992 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 993 } 994 995 /** 996 * Sets the quote matcher to use. 997 * <p> 998 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 999 * </p> 1000 * 1001 * @param quote the quote matcher to use, null ignored. 1002 * @return {@code this} instance. 1003 */ 1004 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1005 if (quote != null) { 1006 this.quoteMatcher = quote; 1007 } 1008 return this; 1009 } 1010 1011 /** 1012 * Sets the matcher for characters to trim. 1013 * <p> 1014 * These characters are trimmed off on each side of the delimiter until the token or quote is found. 1015 * </p> 1016 * 1017 * @param trimmer the trimmer matcher to use, null ignored 1018 * @return {@code this} instance. 1019 */ 1020 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1021 if (trimmer != null) { 1022 this.trimmerMatcher = trimmer; 1023 } 1024 return this; 1025 } 1026 1027 /** 1028 * Gets the number of tokens found in the String. 1029 * 1030 * @return The number of matched tokens. 1031 */ 1032 public int size() { 1033 checkTokenized(); 1034 return tokens.length; 1035 } 1036 1037 /** 1038 * Internal method to performs the tokenization. 1039 * <p> 1040 * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required. 1041 * </p> 1042 * <p> 1043 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or 1044 * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results. 1045 * </p> 1046 * <p> 1047 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other 1048 * values, or even an entirely different array. 1049 * </p> 1050 * 1051 * @param srcChars the character array being tokenized, may be null. 1052 * @param offset the start position within the character array, must be valid. 1053 * @param count the number of characters to tokenize, must be valid. 1054 * @return The modifiable list of String tokens, unmodifiable if null array or zero count. 1055 */ 1056 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1057 if (srcChars == null || count == 0) { 1058 return Collections.emptyList(); 1059 } 1060 final StrBuilder buf = new StrBuilder(); 1061 final List<String> tokenList = new ArrayList<>(); 1062 int pos = offset; 1063 1064 // loop around the entire buffer 1065 while (pos >= 0 && pos < count) { 1066 // find next token 1067 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1068 1069 // handle case where end of string is a delimiter 1070 if (pos >= count) { 1071 addToken(tokenList, StringUtils.EMPTY); 1072 } 1073 } 1074 return tokenList; 1075 } 1076 1077 /** 1078 * Gets the String content that the tokenizer is parsing. 1079 * 1080 * @return The string content being parsed. 1081 */ 1082 @Override 1083 public String toString() { 1084 if (tokens == null) { 1085 return "StrTokenizer[not tokenized yet]"; 1086 } 1087 return "StrTokenizer" + getTokenList(); 1088 } 1089 1090}